more semester / section enrollment stats and model

Coding with Peter 2023-10-30 21:16:07 -07:00
parent 466f789355
commit a1f3778280
2 changed files with 306 additions and 4 deletions


@@ -1929,7 +1929,17 @@ def courses_to_sched():
print(e)
conn.commit()
def query_multiple(q, database=sqlite_file):
    conn, cur = db(database)  # e.g. 'cache/canvas_data/data20231012.db'
    conn.row_factory = dict_factory  # must be set before the cursor is created
    cur = conn.cursor()  # re-create the cursor so rows come back as dicts
    cur.execute(q)
    return cur.fetchall()
def query_execute(q, database=sqlite_file):
    conn, cur = db(database)
    cur.execute(q)
    conn.commit()
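# A quick usage sketch for the two helpers above (the schedule table is the
# one created by sched_to_db below; the exact rows are illustrative):
#
#   rows = query_multiple("SELECT code, sem FROM schedule LIMIT 5")
#   for r in rows:
#       print(r['code'], r['sem'])  # dict rows, courtesy of dict_factory
#   query_execute("DELETE FROM schedule WHERE sem='sp18';")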
def sched_to_db():
    d = 'DROP TABLE IF EXISTS `schedule`;'

stats.py (300 changed lines)

@@ -53,13 +53,16 @@
-
"""
import codecs, os
import codecs, os, warnings, itertools
import json, csv, requests, sys, re
import numpy as np
import pandas as pd
from multiprocessing import Semaphore
from statistics import mean, median, stdev
from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments
from localcache import query_multiple
from collections import defaultdict
all_grades_file = f"cache/grades_all.csv"
@@ -897,15 +900,299 @@ def try_make_sched():
print(teachers)
def section_stats():  # placeholder; shadowed by the full definition further down
    pass
def sched_lookup_tables():
    # Renumber the semesters as consecutive integers
    # sp16 su16 fa16 wi17 sp17 su17 fa17 wi18
    #semesters = "sp18 su18 fa18 wi19 sp19 su19 fa19 wi20 sp20 su20 fa20 wi21 sp21 su21 fa21 wi22 sp22 su22 fa22 wi23 sp23 su23 fa23 wi24 sp24 su24 fa24 wi25 sp25 su25 fa25 wi26".split(" ")
    sem_fourcode = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24 sp25 su25 fa25".split(" ")
    int_numbers = list(range(1, len(sem_fourcode)+1))
    fourcode_2_int = {semester: number for semester, number in zip(sem_fourcode, int_numbers)}
    int_2_fourcode = {v: k for k, v in fourcode_2_int.items()}
    sis_2_fourcode = {}
    fourcode_2_sis = {}
    yr = 2018
    sems = ['30','50','70']  # SIS term suffixes: spring, summer, fall
    i = 0
    semcodes = []
    while yr < 2026:
        for s in sems:
            semcodes.append(f"{yr}{s}")
            sis_2_fourcode[f"{yr}{s}"] = sem_fourcode[i]
            fourcode_2_sis[sem_fourcode[i]] = f"{yr}{s}"
            #print(f"UPDATE schedule SET semsis={yr}{s} WHERE sem='{sem_fourcode[i]}';")
            i += 1
        yr += 1
    return fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes
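# How the lookup tables round-trip (values implied by the construction above):
#
#   fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
#   fourcode_2_int['sp18']    # -> 1, the first semester in the numbering
#   int_2_fourcode[1]         # -> 'sp18'
#   sis_2_fourcode['201830']  # -> 'sp18'
#   fourcode_2_sis['fa25']    # -> '202570'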
def section_stats_bymode():
    # Per course and semester: section count plus actual enrollment by delivery mode.
    # (Aggregate FILTER clauses need SQLite 3.30+.)
    data = query_multiple("SELECT code, semsis, COUNT(id) AS sections, sum(act) FILTER (WHERE type='in-person') AS inperson, sum(act) FILTER (WHERE type='online') AS online, sum(act) FILTER (WHERE type='hybrid') AS hybrid, sum(act) FILTER (WHERE type='online live') AS onlinelive FROM schedule GROUP BY code, semsis ORDER BY code, semsis;", 'cache/canvas_data/data20231012.db')
    df = pd.DataFrame(data)
    df.fillna(0, inplace=True)
    for col in 'sections,inperson,online,hybrid,onlinelive'.split(','):
        df[col] = df[col].astype(int)
    print(df)
    df.to_csv('cache/section_stats_bymode.csv')
    return df
def section_stats():
    # For each course (e.g. ENG1A), how many students are enrolled across all sections?
    # (and break down by mode, time, location, etc.)
    #
    # For each course, how many are first-semester Gav students?
    data = query_multiple("SELECT * FROM schedule ORDER BY code,id", 'cache/canvas_data/data20231012.db')
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    df = pd.DataFrame(data)
    # Drop the columns that aren't needed for the time series
    df = df.drop(columns=['id', 'crn', 'units', 'teacher', 'start', 'end', 'loc', 'cap'])
    codecs.open('cache/sem_mapping.json','w','utf-8').write(json.dumps(fourcode_2_int, indent=2))
    # Map four-letter semester codes to their integer positions and index on them
    df['sem'] = df['sem'].map(fourcode_2_int)
    df.set_index('sem', inplace=True)
    return df
def simple_exp_smoothing_section_model():
    sout = codecs.open('cache/section_predictions.txt','w','utf-8')
    from statsmodels.tsa.api import SimpleExpSmoothing
    warnings.filterwarnings("ignore")
    periods = 3
    start = 19  # integer semester index to start predicting from
    df = section_stats()
    print(df)
    df = df.sort_index()
    predictions = {}
    for course_code in df['code'].unique():
        try:
            print(course_code)
            sout.write(course_code + "\n")
            this_set = df[df['code'] == course_code]['act']
            this_set = this_set.groupby('sem').sum()
            #this_set.fillna(method='ffill', inplace=True)
            #this_set.fillna(0, inplace=True)
            # Create a new index covering every semester in the range
            new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
            # Reindex the series and fill missing semesters with 0
            this_set = this_set.reindex(new_index, fill_value=0)
            print(this_set.to_string())
            sout.write(this_set.to_string() + "\n")
            model = SimpleExpSmoothing(this_set)
            # Start with a smoothing level of 0.2; adjust later depending on how variable the data is
            fit = model.fit(smoothing_level=0.2)
            #prediction = fit.forecast(periods) # predict attendance for the next `periods` semesters
            prediction = fit.predict(start=start, end=start+4)
            print(prediction)
            sout.write(str(prediction) + "\n")
            sout.flush()
            predictions[course_code] = prediction
        except Exception as e:
            print(f"Model creation failed for {course_code} due to {str(e)}")
            sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
"""
model = ARIMA(this_set, order=(1,1,1)) #ARIMA params (p, d, q)
model_fit = model.fit()
forecast_result = model_fit.forecast(steps=periods)
if forecast_result:
predictions[course_code] = forecast_result[0]
else:
print(f"No prediction for {course_code}. Skipping...")"""
# statistics - use an exponential smoothing model to predict the next 3 semesters of enrollment.
# Doesn't really seem to capture the patterns.
def exp_smoothing_section_model():
    sout = codecs.open('cache/section_predictions.txt','w','utf-8')
    from statsmodels.tsa.api import ExponentialSmoothing
    warnings.filterwarnings("ignore")
    periods = 3
    start = 19  # integer semester index to start predicting from
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    df = section_stats()
    print(df)
    df = df.sort_index()
    predictions = {}
    for course_code in df['code'].unique():
        try:
            print(course_code)
            #sout.write(course_code + "\n")
            this_set = df[df['code'] == course_code]['act']
            this_set = this_set.groupby('sem').sum()
            #this_set.fillna(method='ffill', inplace=True)
            #this_set.fillna(0, inplace=True)
            # Create a new index covering every semester in the range
            new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
            # Reindex the series and fill missing semesters with 0
            this_set = this_set.reindex(new_index, fill_value=0)
            print(this_set.to_string())
            for i, v in this_set.items():
                sout.write(f"{course_code},{int_2_fourcode[i]},{v}\n")
            # Note: the schedule runs 3 terms per year (sp/su/fa), so seasonal_periods=3 may fit better than 4
            model = ExponentialSmoothing(this_set, seasonal_periods=4, trend='add', seasonal='add')
            fit = model.fit()
            prediction = fit.predict(start=start, end=start+4)
            print(prediction)
            for i, v in prediction.items():
                v = max(int(v), 0)  # clamp negative forecasts to zero
                sout.write(f"{course_code},{int_2_fourcode[i]}, {v}\n")
            sout.flush()
            predictions[course_code] = prediction
        except Exception as e:
            print(f"Model creation failed for {course_code} due to {str(e)}")
            #sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
def student_by_semester():
    query = """
    SELECT u.name, u.canvasid, s.code, s.semsis FROM users u
    JOIN enrollment e ON u.id = e.user_id
    JOIN courses c ON c.id = e.course_id
    JOIN terms t ON c.termid = t.id
    JOIN schedule s ON c.schedule = s.id
    WHERE e.type='StudentEnrollment' AND e.workflow='active'
    ORDER BY u.sortablename, s.semsis;
    """
    df = pd.DataFrame(query_multiple(query, 'cache/canvas_data/data20231012.db'))
    # Group each student's courses per semester and join them into one ' / '-separated string
    df['courses'] = df.groupby(['name','canvasid','semsis'])['code'].transform(lambda x: ' / '.join(x))
    # Remove the duplicate rows left over from the transform
    df = df[['name','canvasid','semsis','courses']].drop_duplicates()
    # Pivot so each semester becomes a column
    df_pivot = df.pivot_table(values='courses', index=['name','canvasid'], columns='semsis', aggfunc='first').reset_index()
    # Append a '_sem' suffix to the semester columns so they are easy to recognize
    df_pivot.columns = [str(col) + '_sem' if isinstance(col, int) else col for col in df_pivot.columns]
    df_pivot.to_csv('cache/student_by_semester.csv')
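# The resulting CSV has one row per student and one '<semsis>_sem' column per
# semester; illustrative shape (values made up):
#
#   name, canvasid, 201830_sem, 201850_sem, ...
#   "Doe, Jane", 12345, "ENG1A / MATH5", "ENG1B", ...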
def sections_grouped_by_year_mode():
    df = section_stats_bymode()
    # list of unique courses
    df_all_courses = df['code'].unique()
    # list of unique semesters
    df_all_semesters = df['semsis'].unique()
    df_all_semesters.sort()
    raw_data = {}
    for line in df.to_dict('records'):  # iterate rows, not columns
        print(line['semsis'])
        sis = str(line['semsis'])
        year = sis[0:4]
        raw_data[f"{line['code']}{year}"] = [line['inperson'], line['online'], line['hybrid'], line['onlinelive']]
    print(raw_data)
    return  # work in progress; the grouping below is unfinished (see the sketch after this function)
    for course in df_all_courses:
        c = str(course)
        template = {'code':[c,c,c], 'semsis':[], 'inperson':[], 'online':[], 'hybrid':[], 'onlinelive':[]}
        # group semesters into groups of 3 by year
        for i in df_all_semesters:
            j = str(i)
            year = j[0:4]
            print(f"{i} ({year})")
            # for each course, for each group of 3 semesters, fill in values, using 0 if necessary
            # ...
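# One possible way to finish the grouping above (a sketch, not the committed
# approach): for each course and year, collect the three term rows, filling
# zeros where a course wasn't offered that term.
def group_course_by_year_sketch(df):
    # Build a (code, semsis) -> row lookup so missing terms are easy to detect
    by_key = {(r['code'], str(r['semsis'])): r for r in df.to_dict('records')}
    years = sorted({str(s)[0:4] for s in df['semsis'].unique()})
    grouped = {}
    for code in df['code'].unique():
        for year in years:
            rows = []
            for suffix in ('30', '50', '70'):  # spring, summer, fall
                r = by_key.get((code, f"{year}{suffix}"))
                rows.append([r[k] for k in ('inperson','online','hybrid','onlinelive')] if r else [0, 0, 0, 0])
            grouped[f"{code}{year}"] = rows
    return grouped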
def lstm_model_sections():
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    # Preprocessing
    # Normalize inputs for better performance
    df = section_stats_bymode()
    print(df)
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset_scaled = scaler.fit_transform(df.drop(['code', 'semsis'], axis=1))
    print("scaled:")
    print(dataset_scaled)
    # Split features and targets; column 0 of the scaled array is 'sections', the prediction target
    X = dataset_scaled[:, 1:]
    Y = dataset_scaled[:, 0:1]
    # Train / test split
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    # Reshape input to [samples, time steps, features] as required by LSTM
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
    print("x_train shape:", x_train.shape)
    print(x_train)
    print("\n\nTraining...\n\n")
    # LSTM architecture
    model = Sequential()
    model.add(LSTM(50, input_shape=(X.shape[1], 1)))  # 50 LSTM units
    model.add(Dense(1))  # one output: the scaled 'sections' count
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(x_train, y_train, epochs=5, batch_size=1)  # train the model
    # Prediction
    scaler_predict = MinMaxScaler()
    scaler_predict.fit(df[['sections']])  # fit on the target column so the inverse transform is consistent
    trainPredict = model.predict(x_train)
    testPredict = model.predict(x_test)
    # Invert the predictions (undo the normalization)
    trainPredict = scaler_predict.inverse_transform(trainPredict)
    testPredict = scaler_predict.inverse_transform(testPredict)
    # Now you have your future prediction in testPredict.
    print("Predictions:")
    print(testPredict)
    np.savetxt('cache/section_predictions_lstm.txt', testPredict, fmt='%f')
    # I'm lost here...
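# The model above treats the four mode counts as the LSTM's "time steps",
# which isn't really a time series. A more conventional framing (a sketch,
# untested) slides a window over one course's per-semester totals and
# predicts the following semester:
def make_lstm_windows_sketch(series, lookback=3):
    # series: 1-D numpy array of per-semester enrollment for one course
    X, y = [], []
    for t in range(len(series) - lookback):
        X.append(series[t:t+lookback])  # the previous `lookback` semesters
        y.append(series[t+lookback])    # the semester to predict
    X = np.array(X).reshape(-1, lookback, 1)  # [samples, time steps, features]
    return X, np.array(y)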
if __name__ == "__main__":
    options = { 1: ['get all historical grades from ilearn', get_all],
@@ -920,6 +1207,11 @@ if __name__ == "__main__":
        10: ['normalize course histories', normalize_course_histories],
        11: ['cluster student histories', cluster_student_histories],
        12: ['try to make a schedule', try_make_sched],
        13: ['ES model section predict attendance', exp_smoothing_section_model],
        14: ['section stats by mode', section_stats_bymode],
        15: ['student courses by semester', student_by_semester],
        16: ['LSTM model sections', lstm_model_sections],
        17: ['rearrange section data to yearly form', sections_grouped_by_year_mode],
    }
print ('')