From 9e485d7641ffeac13c4ac14207c75353cff6a4c1 Mon Sep 17 00:00:00 2001
From: Coding with Peter <coding.with.peter@gmail.com>
Date: Mon, 25 Mar 2024 18:15:28 -0700
Subject: [PATCH] stats on historical course offerings

---
 courses.py   | 119 +++++++++++++++++++++++++++++++++++++++++++++++++--
 semesters.py |   4 +-
 util.py      |   2 +-
 3 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/courses.py b/courses.py
index 9615c85..9c3c99d 100644
--- a/courses.py
+++ b/courses.py
@@ -2018,7 +2018,119 @@ def change_link_in_all_terms_pages():
     term = 181
 
     courses = getCoursesInTerm(term,get_fresh=1,show=0,active=1)
-    
+
+def enrollment_helper():
+    """Fetch historical section schedules and write enrollment summaries to cache/.
+
+    For each semester key in semesters.sems, downloads the expanded schedule
+    JSON, keeps a subset of each section's fields plus 'sem', 'fill_pct' and
+    'dept', then writes section_history.json/.csv, section_counts_history.csv
+    and section_and_mode_counts_history.csv (with a cluster label per course).
+    If a semester's fetch or processing raises, the error is printed and the
+    loop moves on to the next semester.
+    """
+    from semesters import code, sems, to_sis_sem
+    from util import dept_from_name
+    ignore = ['JLE','JFT', 'CWE']
+    ignore2 = ['AH 190', 'AE 600', 'AE 602', 'AE 603','ACCT 190','AJ 100A', 'AJ 107A', 'AJ 213A','AJ 229A','AJ 231A','AMT 190','ATH 23','BUS 190','CD 190','COS 290','WTRM 290','SPAN 8A', 'SPAN 8B', 'SPAN 8C', 'SPAN 8D', 'RE 190','MKTG 190']
+    keep = 'code,name,days,cap,act,teacher,date,partofday,type,site'.split(',')
+    raw = []
+    # NOTE(review): reverses the shared semesters.code list in place; unused below - confirm
+    code.reverse()
+    sort = {}
+    for s in sems.keys():
+        try:
+            # timeout so a stalled server is reported below instead of hanging
+            sched1 = requests.get(f"http://gavilan.cc/schedule/{s}_sched_expanded.json", timeout=30).json()
+            sort[s] = {}
+            for sect in sched1:
+                if sect['name'] in ignore2:
+                    continue
+                sect_smaller = funcy.project(sect,keep)
+                sect_smaller['sem'] = to_sis_sem(s)
+                # NOTE(review): zero cap OR zero enrollment is stored as 100% - confirm
+                if int(sect_smaller['cap'])==0 or int(sect_smaller['act'])==0:
+                    sect_smaller['fill_pct'] = 100
+                else:
+                    sect_smaller['fill_pct'] = round( (int(sect_smaller['act']) / int(sect_smaller['cap']))*100 )
+                d = dept_from_name(sect_smaller['code'])
+                if d in ignore:
+                    continue
+                sect_smaller['dept'] = d
+                raw.append(sect_smaller)
+                sort[s].setdefault(d, {}).setdefault(sect['code'], []).append(sect_smaller)
+            print(f"{s} OK.")
+        except Exception as e:
+            print(f"{s} not found. {e}")
+
+    # context manager so the JSON file is flushed and closed
+    with codecs.open('cache/section_history.json','w','utf-8') as oo:
+        oo.write(json.dumps(sort,indent=2))
+
+    if not raw:
+        # an empty DataFrame has no 'dept'/'code' columns, so the reports would fail
+        print("No sections fetched; skipping CSV reports.")
+        return
+
+    df = pd.DataFrame(raw)
+    df_sorted = df.sort_values(['dept', 'code', 'type','site','partofday','fill_pct'])
+    df_sorted.to_csv('cache/section_history.csv')
+
+    class_counts = df.groupby(['sem', 'code']).size().reset_index(name='class_count')
+    print("Class counts by semester")
+    print(class_counts)
+    pivot_df = class_counts.pivot_table(index='code', columns='sem', values='class_count', aggfunc='sum', fill_value=0)
+    # Reset the index to move 'code' back to a column
+    pivot_df.reset_index(inplace=True)
+    print(pivot_df)
+    pivot_df.to_csv('cache/section_counts_history.csv')
+
+    # Count sections per semester, course code and type
+    class_type_counts = df.groupby(['sem', 'code', 'type']).size().reset_index(name='class_type_count')
+    print("Class type by semester")
+    print(class_type_counts)
+    pivot_df2 = class_type_counts.pivot_table(index='code', columns=['sem','type'], values='class_type_count', aggfunc='sum', fill_value=0)
+    pivot_df2.reset_index(inplace=True)
+
+    kmeans = try_clustering(pivot_df2.copy())
+    pivot_df2.insert(0, "Cluster", kmeans.labels_)
+    print(pivot_df2)
+    pivot_df2.to_csv('cache/section_and_mode_counts_history.csv')
+
+    # Group by teacher
+    class_teacher_counts = df.groupby(['sem', 'code', 'teacher']).size().reset_index(name='class_teacher_count')
+    print("Class teacher by semester")
+    print(class_teacher_counts)
+
+    # group by COURSE (ie: ENGL1A); for each historical WINTER, SPRING, SUMMER, FALL:
+    #   number of sections offered, by mode, time of day, campus
+    #   all teachers who taught it (and their qual to teach online)
+    #   fill percentage for each section, then by mode, tod, campus
+
+def try_clustering(df, n_clusters=4):
+    """Fit KMeans on every column of *df* except 'code' and return the model.
+
+    df: DataFrame with a 'code' column; assumes the remaining columns are
+        numeric (TODO confirm against callers). The caller's frame is not
+        modified, because drop() returns a new frame.
+    n_clusters: requested cluster count (default 4), capped at the number of
+        rows because KMeans raises when it has fewer samples than clusters.
+    Returns the fitted sklearn KMeans; labels_ holds one label per row of df.
+    """
+    # function-scope import: sklearn is loaded only when this is called
+    from sklearn.cluster import KMeans
+
+    # 'code' is dropped so it is not used as a clustering feature
+    features = df.drop(['code'], axis=1)
+
+    # never ask for more clusters than there are rows to cluster
+    n_clusters = min(n_clusters, len(features))
+
+    # fixed random_state keeps the labels reproducible between runs
+    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features)
+
+    return kmeans
+
 
 if __name__ == "__main__":
     options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,  
@@ -2063,13 +2175,14 @@ if __name__ == "__main__":
                 44: ['List users who passed GOTT 1 / Bootcamp', get_gott1_passers],
                 45: ['List users who passed Plagiarism Module', get_plague_passers],
                 46: ['make courses visible to auth users', modify_courses],
+                47: ['enrollment helper', enrollment_helper],
                 # 24: ['Add course evals to whole semester',instructor_list_to_activate_evals],   
                 # 21: ['Add announcements to homepage', change_course_ann_homepage],
                 # TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
                 #
 
-                45: ['Fetch rubric scores and comments', fetch_rubric_scores],
-                46: ['Fetch announcements in a course', fetch_announcements],
+                50: ['Fetch rubric scores and comments', fetch_rubric_scores],
+                51: ['Fetch announcements in a course', fetch_announcements],
               }
     print ('')
 
diff --git a/semesters.py b/semesters.py
index 39431b6..bc4f196 100644
--- a/semesters.py
+++ b/semesters.py
@@ -1,6 +1,6 @@
 # Try to gather all the different formats and ways of labeling a semester, along with their associated dates.
 
-import json
+import json, funcy
 
 sem_to_short = { 'Summer 2021': 'su21', 'Fall 2021':'fa21', 'Winter 2022':'wi22', 'Spring 2022':'sp22', 'Summer 2022':'su22', 'Fall 2022':'fa22' }
 
@@ -51,6 +51,8 @@ def to_sis_sem(s):
 
 # print(json.dumps(semester_list,indent=2))
 
+sems = funcy.project(semester_list, code)
+#print(json.dumps(sems,indent=2))
 
 """
 
diff --git a/util.py b/util.py
index f8afd9c..674cef4 100644
--- a/util.py
+++ b/util.py
@@ -6,7 +6,7 @@ import re, csv
 from collections import defaultdict
 from bs4 import BeautifulSoup as bs
 import pytz, datetime, dateutil, json
-from time import timedelta
+from datetime import timedelta
 from dateutil import tz