more semester / section enrollment stats and model

Coding with Peter 2023-10-30 21:16:07 -07:00
parent 466f789355
commit a1f3778280
2 changed files with 306 additions and 4 deletions


@@ -1929,7 +1929,17 @@ def courses_to_sched():
print(e)
conn.commit()
def query_multiple(q, database=sqlite_file):
    conn, cur = db(database)  # e.g. 'cache/canvas_data/data20231012.db'
    conn.row_factory = dict_factory  # must be set before the cursor is created
    cur = conn.cursor()  # re-create the cursor so rows come back as dicts
    cur.execute(q)
    return cur.fetchall()
def query_execute(q, database=sqlite_file):
    conn, cur = db(database)
    cur.execute(q)
    conn.commit()
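# A quick usage sketch for the two helpers above (the schedule table is the
# one created by sched_to_db below; the exact rows are illustrative):
#
#   rows = query_multiple("SELECT code, sem FROM schedule LIMIT 5")
#   for r in rows:
#       print(r['code'], r['sem'])  # dict rows, courtesy of dict_factory
#   query_execute("DELETE FROM schedule WHERE sem='sp18';")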
def sched_to_db():
    d = 'DROP TABLE IF EXISTS `schedule`;'

stats.py (300 changed lines)

@@ -53,13 +53,16 @@
-
"""
import codecs, os
import codecs, os, warnings, itertools
import json, csv, requests, sys, re
import numpy as np
import pandas as pd
from multiprocessing import Semaphore
from statistics import mean, median, stdev
from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments
from localcache import query_multiple
from collections import defaultdict
all_grades_file = f"cache/grades_all.csv"
@@ -897,15 +900,299 @@ def try_make_sched():
print(teachers)
def section_stats():  # placeholder; shadowed by the full definition further down
    pass
def sched_lookup_tables():
    # Renumber the semesters as consecutive integers
    # sp16 su16 fa16 wi17 sp17 su17 fa17 wi18
    #semesters = "sp18 su18 fa18 wi19 sp19 su19 fa19 wi20 sp20 su20 fa20 wi21 sp21 su21 fa21 wi22 sp22 su22 fa22 wi23 sp23 su23 fa23 wi24 sp24 su24 fa24 wi25 sp25 su25 fa25 wi26".split(" ")
    sem_fourcode = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24 sp25 su25 fa25".split(" ")
    int_numbers = list(range(1, len(sem_fourcode)+1))
    fourcode_2_int = {semester: number for semester, number in zip(sem_fourcode, int_numbers)}
    int_2_fourcode = {v: k for k, v in fourcode_2_int.items()}
    sis_2_fourcode = {}
    fourcode_2_sis = {}
    yr = 2018
    sems = ['30','50','70']  # SIS term suffixes: spring, summer, fall
    i = 0
    semcodes = []
    while yr < 2026:
        for s in sems:
            semcodes.append(f"{yr}{s}")
            sis_2_fourcode[f"{yr}{s}"] = sem_fourcode[i]
            fourcode_2_sis[sem_fourcode[i]] = f"{yr}{s}"
            #print(f"UPDATE schedule SET semsis={yr}{s} WHERE sem='{sem_fourcode[i]}';")
            i += 1
        yr += 1
    return fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes
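# How the lookup tables round-trip (values implied by the construction above):
#
#   fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
#   fourcode_2_int['sp18']    # -> 1, the first semester in the numbering
#   int_2_fourcode[1]         # -> 'sp18'
#   sis_2_fourcode['201830']  # -> 'sp18'
#   fourcode_2_sis['fa25']    # -> '202570'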
def section_stats_bymode():
    # Per course and semester: section count plus actual enrollment by delivery mode.
    # (Aggregate FILTER clauses need SQLite 3.30+.)
    data = query_multiple("SELECT code, semsis, COUNT(id) AS sections, sum(act) FILTER (WHERE type='in-person') AS inperson, sum(act) FILTER (WHERE type='online') AS online, sum(act) FILTER (WHERE type='hybrid') AS hybrid, sum(act) FILTER (WHERE type='online live') AS onlinelive FROM schedule GROUP BY code, semsis ORDER BY code, semsis;", 'cache/canvas_data/data20231012.db')
    df = pd.DataFrame(data)
    df.fillna(0, inplace=True)
    for col in 'sections,inperson,online,hybrid,onlinelive'.split(','):
        df[col] = df[col].astype(int)
    print(df)
    df.to_csv('cache/section_stats_bymode.csv')
    return df
def section_stats():
    # For each course (e.g. ENG1A), how many students are enrolled across all sections?
    # (and break down by mode, time, location, etc.)
    #
    # For each course, how many are first-semester Gav students?
    data = query_multiple("SELECT * FROM schedule ORDER BY code,id", 'cache/canvas_data/data20231012.db')
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    df = pd.DataFrame(data)
    # Drop the columns that aren't needed for the time series
    df = df.drop(columns=['id', 'crn', 'units', 'teacher', 'start', 'end', 'loc', 'cap'])
    codecs.open('cache/sem_mapping.json','w','utf-8').write(json.dumps(fourcode_2_int, indent=2))
    # Map four-letter semester codes to their integer positions and index on them
    df['sem'] = df['sem'].map(fourcode_2_int)
    df.set_index('sem', inplace=True)
    return df
def simple_exp_smoothing_section_model():
    sout = codecs.open('cache/section_predictions.txt','w','utf-8')
    from statsmodels.tsa.api import SimpleExpSmoothing
    warnings.filterwarnings("ignore")
    periods = 3
    start = 19  # integer semester index to start predicting from
    df = section_stats()
    print(df)
    df = df.sort_index()
    predictions = {}
    for course_code in df['code'].unique():
        try:
            print(course_code)
            sout.write(course_code + "\n")
            this_set = df[df['code'] == course_code]['act']
            this_set = this_set.groupby('sem').sum()
            #this_set.fillna(method='ffill', inplace=True)
            #this_set.fillna(0, inplace=True)
            # Create a new index covering every semester in the range
            new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
            # Reindex the series and fill missing semesters with 0
            this_set = this_set.reindex(new_index, fill_value=0)
            print(this_set.to_string())
            sout.write(this_set.to_string() + "\n")
            model = SimpleExpSmoothing(this_set)
            # Start with a smoothing level of 0.2; adjust later depending on how variable the data is
            fit = model.fit(smoothing_level=0.2)
            #prediction = fit.forecast(periods) # predict attendance for the next `periods` semesters
            prediction = fit.predict(start=start, end=start+4)
            print(prediction)
            sout.write(str(prediction) + "\n")
            sout.flush()
            predictions[course_code] = prediction
        except Exception as e:
            print(f"Model creation failed for {course_code} due to {str(e)}")
            sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
"""
model = ARIMA(this_set, order=(1,1,1)) #ARIMA params (p, d, q)
model_fit = model.fit()
forecast_result = model_fit.forecast(steps=periods)
if forecast_result:
predictions[course_code] = forecast_result[0]
else:
print(f"No prediction for {course_code}. Skipping...")"""
# statistics - use an exponential smoothing model to predict the next 3 semesters of enrollment.
# Doesn't really seem to capture the patterns.
def exp_smoothing_section_model():
    sout = codecs.open('cache/section_predictions.txt','w','utf-8')
    from statsmodels.tsa.api import ExponentialSmoothing
    warnings.filterwarnings("ignore")
    periods = 3
    start = 19  # integer semester index to start predicting from
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    df = section_stats()
    print(df)
    df = df.sort_index()
    predictions = {}
    for course_code in df['code'].unique():
        try:
            print(course_code)
            #sout.write(course_code + "\n")
            this_set = df[df['code'] == course_code]['act']
            this_set = this_set.groupby('sem').sum()
            #this_set.fillna(method='ffill', inplace=True)
            #this_set.fillna(0, inplace=True)
            # Create a new index covering every semester in the range
            new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
            # Reindex the series and fill missing semesters with 0
            this_set = this_set.reindex(new_index, fill_value=0)
            print(this_set.to_string())
            for i, v in this_set.items():
                sout.write(f"{course_code},{int_2_fourcode[i]},{v}\n")
            # Note: the schedule runs 3 terms per year (sp/su/fa), so seasonal_periods=3 may fit better than 4
            model = ExponentialSmoothing(this_set, seasonal_periods=4, trend='add', seasonal='add')
            fit = model.fit()
            prediction = fit.predict(start=start, end=start+4)
            print(prediction)
            for i, v in prediction.items():
                v = max(int(v), 0)  # clamp negative forecasts to zero
                sout.write(f"{course_code},{int_2_fourcode[i]}, {v}\n")
            sout.flush()
            predictions[course_code] = prediction
        except Exception as e:
            print(f"Model creation failed for {course_code} due to {str(e)}")
            #sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
def student_by_semester():
    query = """
    SELECT u.name, u.canvasid, s.code, s.semsis FROM users u
    JOIN enrollment e ON u.id = e.user_id
    JOIN courses c ON c.id = e.course_id
    JOIN terms t ON c.termid = t.id
    JOIN schedule s ON c.schedule = s.id
    WHERE e.type='StudentEnrollment' AND e.workflow='active'
    ORDER BY u.sortablename, s.semsis;
    """
    df = pd.DataFrame(query_multiple(query, 'cache/canvas_data/data20231012.db'))
    # Group each student's courses per semester and join them into one ' / '-separated string
    df['courses'] = df.groupby(['name','canvasid','semsis'])['code'].transform(lambda x: ' / '.join(x))
    # Remove the duplicate rows left over from the transform
    df = df[['name','canvasid','semsis','courses']].drop_duplicates()
    # Pivot so each semester becomes a column
    df_pivot = df.pivot_table(values='courses', index=['name','canvasid'], columns='semsis', aggfunc='first').reset_index()
    # Append a '_sem' suffix to the semester columns so they are easy to recognize
    df_pivot.columns = [str(col) + '_sem' if isinstance(col, int) else col for col in df_pivot.columns]
    df_pivot.to_csv('cache/student_by_semester.csv')
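# The resulting CSV has one row per student and one '<semsis>_sem' column per
# semester; illustrative shape (values made up):
#
#   name, canvasid, 201830_sem, 201850_sem, ...
#   "Doe, Jane", 12345, "ENG1A / MATH5", "ENG1B", ...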
def sections_grouped_by_year_mode():
    df = section_stats_bymode()
    # list of unique courses
    df_all_courses = df['code'].unique()
    # list of unique semesters
    df_all_semesters = df['semsis'].unique()
    df_all_semesters.sort()
    raw_data = {}
    for line in df.to_dict('records'):  # iterate rows, not columns
        print(line['semsis'])
        sis = str(line['semsis'])
        year = sis[0:4]
        raw_data[f"{line['code']}{year}"] = [line['inperson'], line['online'], line['hybrid'], line['onlinelive']]
    print(raw_data)
    return  # work in progress; the grouping below is unfinished (see the sketch after this function)
    for course in df_all_courses:
        c = str(course)
        template = {'code':[c,c,c], 'semsis':[], 'inperson':[], 'online':[], 'hybrid':[], 'onlinelive':[]}
        # group semesters into groups of 3 by year
        for i in df_all_semesters:
            j = str(i)
            year = j[0:4]
            print(f"{i} ({year})")
            # for each course, for each group of 3 semesters, fill in values, using 0 if necessary
            # ...
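# One possible way to finish the grouping above (a sketch, not the committed
# approach): for each course and year, collect the three term rows, filling
# zeros where a course wasn't offered that term.
def group_course_by_year_sketch(df):
    # Build a (code, semsis) -> row lookup so missing terms are easy to detect
    by_key = {(r['code'], str(r['semsis'])): r for r in df.to_dict('records')}
    years = sorted({str(s)[0:4] for s in df['semsis'].unique()})
    grouped = {}
    for code in df['code'].unique():
        for year in years:
            rows = []
            for suffix in ('30', '50', '70'):  # spring, summer, fall
                r = by_key.get((code, f"{year}{suffix}"))
                rows.append([r[k] for k in ('inperson','online','hybrid','onlinelive')] if r else [0, 0, 0, 0])
            grouped[f"{code}{year}"] = rows
    return grouped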
def lstm_model_sections():
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    # Preprocessing
    # Normalize inputs for better performance
    df = section_stats_bymode()
    print(df)
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset_scaled = scaler.fit_transform(df.drop(['code', 'semsis'], axis=1))
    print("scaled:")
    print(dataset_scaled)
    # Split features and targets; column 0 of the scaled array is 'sections', the prediction target
    X = dataset_scaled[:, 1:]
    Y = dataset_scaled[:, 0:1]
    # Train / test split
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    # Reshape input to [samples, time steps, features] as required by LSTM
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
    print("x_train shape:", x_train.shape)
    print(x_train)
    print("\n\nTraining...\n\n")
    # LSTM architecture
    model = Sequential()
    model.add(LSTM(50, input_shape=(X.shape[1], 1)))  # 50 LSTM units
    model.add(Dense(1))  # one output: the scaled 'sections' count
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(x_train, y_train, epochs=5, batch_size=1)  # train the model
    # Prediction
    scaler_predict = MinMaxScaler()
    scaler_predict.fit(df[['sections']])  # fit on the target column so the inverse transform is consistent
    trainPredict = model.predict(x_train)
    testPredict = model.predict(x_test)
    # Invert the predictions (undo the normalization)
    trainPredict = scaler_predict.inverse_transform(trainPredict)
    testPredict = scaler_predict.inverse_transform(testPredict)
    # Now you have your future prediction in testPredict.
    print("Predictions:")
    print(testPredict)
    np.savetxt('cache/section_predictions_lstm.txt', testPredict, fmt='%f')
    # I'm lost here...
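# The model above treats the four mode counts as the LSTM's "time steps",
# which isn't really a time series. A more conventional framing (a sketch,
# untested) slides a window over one course's per-semester totals and
# predicts the following semester:
def make_lstm_windows_sketch(series, lookback=3):
    # series: 1-D numpy array of per-semester enrollment for one course
    X, y = [], []
    for t in range(len(series) - lookback):
        X.append(series[t:t+lookback])  # the previous `lookback` semesters
        y.append(series[t+lookback])    # the semester to predict
    X = np.array(X).reshape(-1, lookback, 1)  # [samples, time steps, features]
    return X, np.array(y)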
if __name__ == "__main__":
    options = { 1: ['get all historical grades from ilearn', get_all],
@@ -920,6 +1207,11 @@ if __name__ == "__main__":
        10: ['normalize course histories', normalize_course_histories],
        11: ['cluster student histories', cluster_student_histories],
        12: ['try to make a schedule', try_make_sched],
        13: ['ES model section predict attendance', exp_smoothing_section_model],
        14: ['section stats by mode', section_stats_bymode],
        15: ['student courses by semester', student_by_semester],
        16: ['LSTM model sections', lstm_model_sections],
        17: ['rearrange section data to yearly form', sections_grouped_by_year_mode],
    }
print ('')