# statistics
"""
## Investigate: Success rates (grades) of students in:

- online courses (overall)
- sync, async, and online live courses
- teachers/courses that have passed POCR (are they all async?)
- teachers who have done more than the minimum training in online teaching
- in-person classes, if grades are available


## Data collection

- Choose how many semesters (10?)
- Script 1 - given a CRN and semester, download all grades
    - Check if grades were used and make sense
    - Compute mean, % > 70, median, etc.

- Anonymization steps
    - replace teacher names with ID numbers
    - replace student names with ID numbers
    - replace course names with course codes

- Script 2 - given all semester schedules, generate lists of:
    - CRNs which are online, online live, hybrid, in-person, or excluded
    - CRNs in which the teacher and course have passed POCR (and the semester is later than their pass date)
    - CRNs in which the teacher passed POCR for a different course (and the semester is later than their pass date)
    - CRNs to exclude, for example SP20 because of COVID; possibly SU20 and FA20
    - CRNs which are POCR approved
    - CRNs in which the teacher has done more than the minimum training in online teaching
    - Student IDs which have participated in the online orientation above a certain threshold

- Next steps: generate the cross-reference for which categories teachers are in, and
  integrate it into the main data file.

- Next steps (June/July 2023)
    - add campus, time of day, and sem_order (which semester of their college career they took it) columns
    - organize rows by student
    - develop a way to categorize students by course set and/or score set (clustering: k-means, random forests, etc.)

- Goals
    - display and summarize clusters of students on a dashboard
    - ongoing categorization (implying course recommendations and interventions) based on it


## Hypothesis Testing

-
"""

import codecs, os, warnings, itertools
import json, csv, requests, sys, re
import numpy as np
import pandas as pd
from multiprocessing import Semaphore
from statistics import mean, median, stdev
from collections import defaultdict

from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments, query_multiple

# Cache file locations
all_grades_file = "cache/grades_all.csv"
all_courses_file = "cache/course_grades_all.csv"
all_courses_file2 = "cache/course_grades_compact.csv"
all_courses_file3 = "cache/course_grades_full.csv"
all_courses_file4 = "cache/course_grades_full_bystudent.csv"
all_courses_file5 = "cache/courses_passed_bystudent.csv"
student_courses_scores = "cache/courses_student_scores.csv"
student_orientation_participation = 'cache/participation_orientation_courses.json'

def num(s):
    """Parse a CSV cell into an int or float; empty strings count as 0."""
    if s == '': return 0
    s = re.sub(r'\.0$', '', s)   # strip a trailing ".0" only, so whole numbers parse as ints
    try:
        return int(s)
    except ValueError:
        return float(s)

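# Examples: num('') -> 0, num('85.0') -> 85, num('85.5') -> 85.5
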
def sem_num_to_code(sem_num):
    """Convert an SIS term number like '202370' into a short code like 'fa23'."""
    p = re.search(r'^(\d\d\d\d)(\d\d)$', sem_num)
    if p:
        yr = p.group(1)[2:4]
        sem = p.group(2)
        lookup = {'10': 'wi', '30': 'sp', '50': 'su', '70': 'fa'}
        return f"{lookup[sem]}{yr}"
    return ""


def sem_code_to_num(sem_code):
    """Convert a short code like 'fa23' into an SIS term number like '202370'."""
    p = re.search(r'^([a-z]{2})(\d\d)$', sem_code)
    if p:
        s = p.group(1)
        y = p.group(2)
        lookup = {'wi': '10', 'sp': '30', 'su': '50', 'fa': '70'}
        return f"20{y}{lookup[s]}"
    return ""

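# Examples: sem_num_to_code('202370') -> 'fa23', sem_code_to_num('sp21') -> '202130'
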
def codetest():
    """Spot-check the two semester-code converters on known values."""
    sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')
    codes = 'fa21 wi22 sp23 su23 fa23 wi24'.split(' ')
    for s in sems:
        print("{}: {}".format(s, sem_num_to_code(s)))

    for c in codes:
        print("{}: {}".format(c, sem_code_to_num(c)))

def get_all():
    """Script 1: download final/current grades for every active course in each term."""
    # Canvas term ids, paired positionally with the matching SIS semester numbers
    terms = '178 177 176 175 174 173 172 171 168 65 64 62 63 61 60 25 26 23 22 21'.split(' ')
    sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')

    # Save grades to a CSV file
    with open(all_grades_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["crn", "sem", "coursecode", "s_can_id", "g", "name", "current", "final"])
        for (term, sem) in zip(terms, sems):
            print(term, sem, "\n")
            courses = getCoursesInTerm(term, get_fresh=0, show=0, active=1)
            for c in courses:
                print(c['name'])
                c_code = c['course_code']
                grades(writer, sem, c['id'], c_code)
                csvfile.flush()

def grades(writer, sem, COURSE_ID, course_code):
    """Fetch the users of one course and write a grade row for each student enrollment."""
    params = {"include[]": ["enrollments", "current_grading_period_scores"]}
    users = fetch(url + f"/api/v1/courses/{COURSE_ID}/users", 0, params)

    for student in users:
        try:
            id = student["id"]
            name = student["name"]
            g = student["login_id"]
            print("\t", name)
            if student['enrollments'][0]['type'] == 'StudentEnrollment':
                grade = student["enrollments"][0]["grades"]["final_score"]
                current = student["enrollments"][0]["grades"]["current_score"]
                writer.writerow([COURSE_ID, sem, course_code, id, g, name, current, grade])
        except Exception as e:
            print("Exception:", e)

def get_student_orientations():
    """Total up orientation-course page views per student, across all orientation shells."""
    courses = {'iLearn Student Orientation 2022': '9768',               # 8170 students
               'Kickstart Online Orientation - Transfer': '36',         # 6149
               'Kickstart Online Orientation - New to College': '35',   # 5392
               'LIB732 SP18': '3295',                                   # 2193
               'LIB732 FA17': '2037',                                   # 1868
               'LIB732 SP17': '69',                                     # 1645
               'Kickstart Online Orientation - Returning': '37',        # 1463
               'iLearn Student Orientation 2023': '15924',              # 1292
               'LIB732 SU17': '1439'                                    # 1281
               }

    views_bycourse = {}
    all_student_ids = set()

    # get pageviews of each orientation course
    for c, i in courses.items():
        print(c)
        cache_file_name = f'cache/participation_course_{i}.json'
        student_ids = [x[1] for x in get_course_enrollments(i)]
        all_student_ids.update(student_ids)
        if os.path.exists(cache_file_name):
            pv = json.loads(codecs.open(cache_file_name, 'r', 'utf-8').read())
        else:
            pv = get_student_page_views(i, student_ids)
            codecs.open(cache_file_name, 'w', 'utf-8').write(json.dumps(pv, indent=2))
        views_bycourse[i] = pv

    # add up pageviews for each student
    views_bystudent = {}
    for student_id in all_student_ids:
        views_bystudent[student_id] = sum([views_bycourse[i].get(student_id, 0) for i in courses.values()])
    codecs.open(student_orientation_participation, 'w', 'utf-8').write(json.dumps(views_bystudent, indent=2))

def get_student_page_views(course_id, student_ids):
    """Query the Canvas analytics endpoint for each student's page views in one course."""
    page_views = {}
    verbose = 0

    for student_id in student_ids:
        a = f'/api/v1/courses/{course_id}/analytics/users/{student_id}/activity'
        response = fetch(url + a, verbose)
        page_views[student_id] = sum(response.get('page_views', {}).values())

    if verbose: print(page_views)
    return page_views

schedules = {}
orientations = {}


def load_schedules():
    """Lazily load every cached '<sem>_sched_expanded.json' schedule into the schedules dict."""
    global schedules
    if not schedules:
        for f in os.listdir('cache/schedule'):
            m = re.search(r'(\w\w\d\d)_sched_expanded\.json', f)
            if m:
                sem = m.group(1)
                schedules[sem] = json.loads(codecs.open('cache/schedule/' + f, 'r', 'utf-8').read())


def load_orientations():
    """Lazily load the per-student orientation page-view totals."""
    global orientations
    if not orientations:
        orientations = json.loads(codecs.open(student_orientation_participation, 'r', 'utf-8').read())
    return orientations

def to_crn_fallback(name):
|
|
#print(name)
|
|
name = name.lower()
|
|
try:
|
|
m1 = re.search(r'(\d\d\d\d\d)',name)
|
|
if m1:
|
|
crn = m1.group(1)
|
|
else:
|
|
return None,None
|
|
m2 = re.search(r'([wispufa][wispufa]\d\d)',name.lower())
|
|
if m2:
|
|
sem = m2.group(1)
|
|
else:
|
|
return None, None
|
|
#print(name, crn, sem)
|
|
return crn, sem
|
|
except Exception as e:
|
|
#print("Exception: ", e, name)
|
|
return None, None
|
|
|
|
def ilearn_name_to_course_code(iname):
|
|
parts = iname.split(' ')
|
|
code = parts[0]
|
|
return code
|
|
|
|
def short_name_to_crn(name):
|
|
#print(name)
|
|
try:
|
|
parts = name.split(' ')
|
|
code = parts[0]
|
|
sem = parts[1]
|
|
crn = parts[2]
|
|
m_sem = re.search(r'^(\w\w\d\d)$',sem)
|
|
if not m_sem:
|
|
return to_crn_fallback(name)
|
|
m = re.search(r'^(\d\d\d\d\d)$',crn)
|
|
if m:
|
|
return crn,sem
|
|
else:
|
|
crn_parts = crn.split('/')
|
|
m = re.search(r'^(\d\d\d\d\d)$',crn_parts[0])
|
|
if m:
|
|
return crn_parts[0],sem
|
|
#print("non standard course short name: ", code, sem, crn)
|
|
return to_crn_fallback(name)
|
|
except Exception as e:
|
|
#print("Exception: ", e, name)
|
|
return to_crn_fallback(name)
|
|
|
|
|
|
def fixname(n):
|
|
return re.sub(r'\s+',' ', n).strip()
|
|
|
|
|
|
def short_name_to_teacher_type_crn_sem(name):
|
|
load_schedules()
|
|
crn, sem = short_name_to_crn(name)
|
|
|
|
try:
|
|
if sem:
|
|
sem = sem.lower()
|
|
if sem[0:2]=='wi':
|
|
sem = 'sp' + sem[2:]
|
|
for course in schedules[sem]:
|
|
if course['crn'] == crn:
|
|
return fixname(course['teacher']), course['type'], crn, sem
|
|
except Exception as e:
|
|
return None, None, None, None
|
|
|
|
return None, None, None, None
|
|
|
|
pocrs = {}
|
|
|
|
def load_pocrs():
|
|
global pocrs
|
|
if not pocrs:
|
|
with open('cache/pocr_passed.csv') as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
next(csvreader)
|
|
for row in csvreader:
|
|
pocrs[row[0] + " " + row[1]] = row[2]
|
|
return pocrs
|
|
|
|
def lookup_pocr(teacher,course,sem):
|
|
p = load_pocrs()
|
|
pcode = teacher + " " + course
|
|
if pcode in p:
|
|
sem_passed = sem_code_to_num(p[pcode])
|
|
sem_test = sem_code_to_num(sem)
|
|
if sem_passed < sem_test:
|
|
return True
|
|
return False
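

# Usage sketch (hedged): the names below are made up, and the key format must match
# cache/pocr_passed.csv, i.e. "<teacher as it appears in the schedule> <course code>".
# lookup_pocr returns True only when the section's semester is strictly later than the
# semester in which that teacher/course pair passed POCR.
#
#   lookup_pocr("Jane Smith", "CSIS85", "fa23")   # True if "Jane Smith CSIS85" passed before fa23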
|
|
|
|
def nametest():
    """Spot-check the short-name parser against every course name in the grades file."""
    with open(all_courses_file) as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)   # skip the header row once; skipping inside the loop would drop every other row

        for row in csvreader:
            print(row[0], "-", short_name_to_teacher_type_crn_sem(row[0]))

def above_70(li, maximum):
    """Fraction of scores at or above 70% of the maximum score (rounded to 3 places)."""
    cutoff = 0.7 * maximum
    above = list(filter(lambda x: x >= cutoff, li))
    return round(len(above) / len(li), 3)
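
# Example: above_70([90, 75, 50, 65], 100) -> 0.5  (two of the four scores reach the 70-point cutoff)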
|
|
|
|
|
|
|
|
|
|
|
|
# v1, does a row of averages for each course
|
|
def process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code):
|
|
fxns = [mean, median, stdev, min, max, len]
|
|
c_id = block[0][0]
|
|
sem = block[0][1]
|
|
course_code = block[0][2]
|
|
cur_scores = [num(x[6]) for x in block]
|
|
final_scores = [num(x[7]) for x in block]
|
|
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
|
|
if not teacher:
|
|
return
|
|
tch_code = teacher_to_code[teacher]
|
|
crs_code = course_to_code[course_code]
|
|
if len(final_scores) < 2:
|
|
return
|
|
try:
|
|
(cur_mean, cur_median, cur_stdev, cur_min, cur_max, cur_count) = [round(f(cur_scores)) for f in fxns]
|
|
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
|
|
|
|
cur_pct_passed = above_70(cur_scores, cur_max)
|
|
final_pct_passed = above_70(final_scores, final_max)
|
|
|
|
if final_max == 0: return
|
|
|
|
scaled_final_scores = [ x / final_max for x in final_scores]
|
|
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
|
|
|
|
good_code = ilearn_name_to_course_code(course_code)
|
|
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
|
|
|
|
output.writerow( [crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, scl_min, scl_max, final_count] )
|
|
out_c.writerow([crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, final_count])
|
|
except Exception as e:
|
|
print("Exception:", e)
|
|
|
|
|
|
|
|
|
|
# v2, one line per student/course
|
|
def process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code):
|
|
fxns = [mean, median, stdev, min, max, len]
|
|
c_id = block[0][0]
|
|
sem = block[0][1]
|
|
course_code = block[0][2]
|
|
cur_scores = [num(x[6]) for x in block]
|
|
final_scores = [num(x[7]) for x in block]
|
|
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
|
|
if not teacher:
|
|
return
|
|
tch_code = teacher_to_code[teacher]
|
|
crs_code = course_to_code[course_code]
|
|
if len(final_scores) < 2:
|
|
return
|
|
try:
|
|
|
|
# "course_code course pocr_status orientation_status teacher_code mode student_id scaled_score"
|
|
|
|
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
|
|
final_pct_passed = above_70(final_scores, final_max)
|
|
|
|
if final_max == 0: return
|
|
|
|
scaled_final_scores = [ x / final_max for x in final_scores]
|
|
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
|
|
|
|
good_code = ilearn_name_to_course_code(course_code)
|
|
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
|
|
|
|
o = load_orientations()
|
|
|
|
for row in block:
|
|
student_id = row[3]
|
|
orientation = o[student_id] if student_id in o else 0
|
|
scaled_score = round(num(row[7]) / final_max, 2)
|
|
out_f.writerow([crs_code, good_code, pocr, orientation, tch_code, mode, student_id, scaled_score])
|
|
print(course_code)
|
|
except Exception as e:
|
|
print("Exception:", e)
|
|
|
|
def process_grades():
|
|
# first loop to get all names
|
|
courses_labeled = {}
|
|
teacher_to_code = {}
|
|
code_to_teacher = {}
|
|
|
|
course_to_code = {}
|
|
code_to_course = {}
|
|
|
|
index = 1001
|
|
crs_index = 4001
|
|
|
|
with open(all_grades_file, newline="") as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
next(csvreader)
|
|
for row in csvreader:
|
|
crn_sem = row[0] + '_' + row[1]
|
|
if not crn_sem in courses_labeled:
|
|
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(row[2])
|
|
courses_labeled[crn_sem] = teacher
|
|
|
|
if not row[2] in course_to_code:
|
|
course_to_code[row[2]] = crs_index
|
|
code_to_course[crs_index] = row[2]
|
|
crs_index += 1
|
|
|
|
if teacher:
|
|
if not teacher in teacher_to_code:
|
|
teacher_to_code[teacher] = index
|
|
code_to_teacher[index] = teacher
|
|
index += 1
|
|
codecs.open('cache/teacher_lookup_codes.json','w','utf-8').write( json.dumps( [teacher_to_code, code_to_teacher], indent=2) )
|
|
codecs.open('cache/course_lookup_codes.json','w','utf-8').write( json.dumps( [course_to_code, code_to_course], indent=2) )
|
|
|
|
out_fullrows = codecs.open(all_courses_file3,'w','utf-8')
|
|
out_f = csv.writer(out_fullrows)
|
|
out_f.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
|
|
|
|
out_compact = codecs.open(all_courses_file2,'w','utf-8')
|
|
out_c = csv.writer(out_compact)
|
|
out_c.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev count".split(" "))
|
|
with open(all_courses_file, "w", newline="") as output_f:
|
|
output = csv.writer(output_f)
|
|
output.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev scl_min scl_max count".split(" "))
|
|
|
|
with open(all_grades_file, newline="") as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
block = []
|
|
current_index = None
|
|
|
|
next(csvreader)
|
|
|
|
for row in csvreader:
|
|
index = row[0]
|
|
|
|
if index != current_index:
|
|
if block:
|
|
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
|
|
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
|
|
block = []
|
|
current_index = index
|
|
|
|
block.append(row)
|
|
|
|
if block:
|
|
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
|
|
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
|
|
|
|
|
|
def reorganize_grades_student():
|
|
with open(all_courses_file3, newline="") as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
bystudent = defaultdict(list)
|
|
|
|
next(csvreader)
|
|
|
|
for row in csvreader:
|
|
st = row[6]
|
|
bystudent[st].append(row)
|
|
|
|
students = sorted(bystudent.keys())
|
|
with open(all_courses_file4, "w", newline="") as output_f:
|
|
with open(all_courses_file5, "w", newline="") as output_s:
|
|
with open(student_courses_scores,'w') as output_scs:
|
|
output_s.write("student,courses\n")
|
|
output = csv.writer(output_f)
|
|
output.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
|
|
# student id 0 has no courses
|
|
output.writerow([0,])
|
|
for st in students:
|
|
courses = [r[1] for r in bystudent[st]]
|
|
scores = [r[7] for r in bystudent[st]]
|
|
zipped = zip(courses,scores)
|
|
output_scs.write(st + ",")
|
|
for c,s in zipped:
|
|
output_scs.write(f"{c}|{s},")
|
|
output_scs.write("\n")
|
|
output_s.write(st + "," + " ".join(courses) + "\n")
|
|
for row in bystudent[st]:
|
|
output.writerow(row)
|
|
|
|
|
|
def all_course_names_setup():
|
|
cc = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
|
|
courses = {}
|
|
for C in cc.values():
|
|
name = C['dept'] + C['number']
|
|
#print(name)
|
|
courses[ name ] = C
|
|
|
|
#co = codecs.open('cache/courses/names.json','w','utf-8')
|
|
#for c in sorted(courses.keys()):
|
|
# co.write(c + "\n")
|
|
|
|
cr = codecs.open('cache/courses/names.json','r','utf-8')
|
|
|
|
|
|
from_data = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
|
|
unknown = {}
|
|
for line in from_data:
|
|
parts = line.split(',')
|
|
stu_id = parts[0]
|
|
ea = parts[1:]
|
|
for C in ea:
|
|
each = C.split('|')
|
|
name = each[0]
|
|
if not name in courses:
|
|
unknown[name] = name
|
|
#data_courses[each[0]] += 1
|
|
for c in sorted(unknown.keys()):
|
|
print(c)
|
|
|
|
#co.write( json.dumps( {'unknown':unknown, 'coursenames':courses}, indent=2 ))
|
|
|
|
|
|
lookup = {}
|
|
names = {}
|
|
|
|
def shell2course(shell):
|
|
global lookup, names
|
|
if not lookup:
|
|
cr = json.loads(codecs.open('cache/courses/names.json','r','utf-8').read())
|
|
lookup = cr['unknown']
|
|
allcourses = cr['coursenames']
|
|
names = allcourses.keys()
|
|
|
|
if shell in names:
|
|
return shell
|
|
if shell in lookup:
|
|
c = lookup[shell]
|
|
if c in names:
|
|
return c
|
|
#print(f"Can't find course: {shell}")
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def stu_record_line(line):
|
|
line = line.strip()
|
|
line = line.strip(',')
|
|
parts = line.split(',')
|
|
stu_id = parts[0]
|
|
courses = []
|
|
for C in parts[1:]:
|
|
courses.append(C.split('|'))
|
|
return stu_id, courses
|
|
|
|
def stu_record_to_vector(line, boolean=0):
|
|
id, courses = stu_record_line(line)
|
|
|
|
yesval = "true" if boolean else 1
|
|
noval = "false" if boolean else 0
|
|
|
|
template = json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
|
|
lookup = {}
|
|
for i,c in enumerate(template):
|
|
lookup[c] = i
|
|
vector = [noval for x in range(len(template))]
|
|
for C in courses:
|
|
goodname = shell2course(C[0])
|
|
if goodname:
|
|
vector[lookup[goodname]] = yesval # C[1] # score
|
|
return id,vector,courses
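

# Illustration (hedged, hypothetical course names): a line such as
#   "12345,CSIS85|0.92,ENG1A|0.81,"
# yields id '12345' and a vector with yesval at the template positions for CSIS85 and
# ENG1A (looked up via shell2course / cache/courses/course_main_record.json) and noval
# everywhere else; the per-course scores are returned separately in `courses`.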
|
|
|
|
|
|
def grades_to_vectors(boolean=0, verbose=0):
|
|
grades = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
|
|
for L in grades:
|
|
id, vector, courses = stu_record_to_vector(L,boolean)
|
|
if verbose: print(id, vector)
|
|
yield id, vector, courses
|
|
|
|
def course_main_record():
|
|
return json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
|
|
|
|
|
|
def courses_to_vector_ordered(course_list):
|
|
# each course is (name, semester_order, score)
|
|
template = course_main_record()
|
|
lookup = {}
|
|
for i,c in enumerate(template):
|
|
lookup[c] = i
|
|
vector = ['0' for x in range(len(template))]
|
|
for course,order,score in course_list:
|
|
goodname = shell2course(course)
|
|
if goodname:
|
|
vector[lookup[goodname]] = str(order)
|
|
return vector
|
|
|
|
def courses_to_vector(course_list, boolean=1):
|
|
#print(course_list)
|
|
yesval = "true" if boolean else 1
|
|
noval = "false" if boolean else 0
|
|
template = course_main_record()
|
|
lookup = {}
|
|
for i,c in enumerate(template):
|
|
lookup[c] = i
|
|
vector = [noval for x in range(len(template))]
|
|
for C in course_list:
|
|
C = C.strip()
|
|
#goodname = shell2course(C[0])
|
|
#if goodname:
|
|
#print(C)
|
|
vector[lookup[C]] = yesval # C[1] # score
|
|
#print(vector)
|
|
return vector
|
|
|
|
def course_vector_to_names(vector):
|
|
template = course_main_record()
|
|
names = []
|
|
for i,v in enumerate(vector):
|
|
if v:
|
|
names.append(template[i])
|
|
return names
|
|
|
|
|
|
def all_course_names():
|
|
ac = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
|
|
master_record = []
|
|
for C in ac.values():
|
|
if C['status'] == 'Draft':
|
|
continue
|
|
name = C['dept'] + C['number']
|
|
master_record.append(name)
|
|
master_record = set(master_record)
|
|
master_record = list(master_record)
|
|
master_record = sorted(master_record)
|
|
|
|
## Extract from all 'accomplished courses'...
|
|
if 0:
|
|
complete_list = {}
|
|
missing_names = {}
|
|
with open(student_courses_scores,'r') as input_f:
|
|
for L in input_f:
|
|
stu_id, courses = stu_record_line(L)
|
|
for C in courses:
|
|
real_name = shell2course(C[0])
|
|
if real_name:
|
|
complete_list[real_name] = 1
|
|
else:
|
|
missing_names[C[0]] = 1
|
|
master_record = sorted(complete_list.keys())
|
|
print(f"Found {len(master_record)} courses")
|
|
print(master_record)
|
|
print(f"Missing {len(missing_names)} courses")
|
|
print(missing_names)
|
|
mr = codecs.open('cache/courses/course_main_record.json','w','utf-8')
|
|
mr.write(json.dumps(master_record,indent=2))
|
|
|
|
|
|
from semesters import sems_by_human_name, canvas_label
|
|
from semesters import code as semester_order
|
|
from localcache import all_students_history
|
|
from datetime import datetime, timedelta
|
|
|
|
def semester_dates():
|
|
#print()
|
|
for c in canvas_label:
|
|
print(sems_by_human_name[c])
|
|
|
|
length = 15
|
|
if sems_by_human_name[c]['code'][0:2] == 'su':
|
|
length = 5
|
|
start_date = sems_by_human_name[c]['start']
|
|
# Convert the date string to a datetime object
|
|
date_object = datetime.strptime(start_date, '%m/%d/%y')
|
|
start_fmt = date_object.strftime('%a %b %d, %Y')
|
|
|
|
# Add 15weeks, 5days to the date
|
|
new_date = date_object + timedelta(weeks=15)
|
|
new_date = new_date + timedelta(days=5)
|
|
|
|
# Format the new date as a string
|
|
new_date_string = new_date.strftime('%m/%d/%y')
|
|
end_fmt = new_date.strftime('%a %b %d, %Y')
|
|
|
|
# Print the new date
|
|
print(f"start: {start_fmt}, end: {end_fmt}")
|
|
|
|
|
|
|
|
current_student = ""
|
|
current_student_block = []
|
|
current_student_info = {'first':'', 'last':''}
|
|
normalized_blocks = []
|
|
|
|
ignore_courses = "El,zACCT20,GASPAR".split(",")
|
|
seen_courses = []
|
|
|
|
def course_line_process(line):
|
|
global current_student, current_student_block, seen_courses, normalized_blocks, current_student_info
|
|
|
|
sem = line['term_name']
|
|
m1 = re.search(r'^(\d\d\d\d)\s(\w+)$', sem)
|
|
if not m1: # is NOT an academic semester, skip
|
|
return
|
|
|
|
uid = line['canvasid']
|
|
if uid != current_student:
|
|
if current_student_block:
|
|
current_student_block.append(current_student_info)
|
|
normalized_blocks.append(current_student_block)
|
|
current_student_block = []
|
|
current_student_info = {'first':sems_by_human_name[sem]['code'], 'last':''}
|
|
current_student = uid
|
|
#print(f"Student: {uid} ({line['user_name']})")
|
|
|
|
# line is a dict
|
|
current_student_info['last'] = sems_by_human_name[sem]['code']
|
|
year, season = m1.group(1), m1.group(2)
|
|
date_format = "%Y-%m-%d %H:%M:%S.%f"
|
|
create_dt = datetime.strptime(line['created'], date_format)
|
|
update_dt = datetime.strptime(line['updated'], date_format)
|
|
sem_start = datetime.strptime(sems_by_human_name[sem]['start'], '%m/%d/%y')
|
|
|
|
course = line['course_name']
|
|
c_parts = course.split(' ')
|
|
if c_parts[0] in ignore_courses or c_parts[0] in seen_courses:
|
|
return
|
|
classname = shell2course(c_parts[0])
|
|
if not classname:
|
|
# print empty dict entry for initial setup
|
|
# print(f" \"{c_parts[0]}\": \"\",")
|
|
seen_courses.append(c_parts[0])
|
|
else:
|
|
#
|
|
flow = line['workflow']
|
|
mark = '+'
|
|
if flow == "deleted": mark = '-'
|
|
# normal start & finish, give add date
|
|
add_day = sem_start - create_dt
|
|
add_day = add_day.days
|
|
sign = '-'
|
|
if add_day < 0:
|
|
add_day = -add_day
|
|
sign = '+'
|
|
#print(f" {mark} {classname} added T{sign}{add_day} {semester_list[sem]['code']}")
|
|
temp_usr_name = re.sub(r',','',line['user_name'])
|
|
current_student_block.append(f"{uid},{temp_usr_name},{classname},add,T{sign}{add_day},{sems_by_human_name[sem]['code']}")
|
|
if flow == "deleted":
|
|
# deleted, give delete date
|
|
del_day = sem_start - update_dt
|
|
del_day = del_day.days
|
|
sign = '-'
|
|
if del_day < 0:
|
|
del_day = -del_day
|
|
sign = '+'
|
|
#print(f" {mark} {classname} deleted T{sign}{del_day} {semester_list[sem]['code']}")
|
|
current_student_block.append(f"{uid},{temp_usr_name},{classname},del,T{sign}{del_day},{sems_by_human_name[sem]['code']}")
|
|
|
|
|
|
def normalize_course_histories():
|
|
global normalized_blocks, current_student_block, current_student_info
|
|
all_students_history(course_line_process, limit=99910000)
|
|
current_student_block.append(current_student_info)
|
|
normalized_blocks.append(current_student_block)
|
|
|
|
codecs.open('cache/normalized_student_add_drop.json','w','utf-8').write(json.dumps(normalized_blocks,indent=2))
|
|
|
|
# let's see if we can get grades...
|
|
grades_by_student_course = defaultdict(dict)
|
|
print("Doing grades...")
|
|
with codecs.open('cache/courses_student_scores.csv','r','utf-8') as gradesfile:
|
|
for s in gradesfile:
|
|
parts = s.split(',')
|
|
stu = int(parts[0])
|
|
#print(stu)
|
|
for c in parts[1:]:
|
|
try:
|
|
#print(c)
|
|
crs,gra = c.split('|')
|
|
grades_by_student_course[stu][crs] = gra
|
|
except Exception as e:
|
|
pass
|
|
|
|
# go through again
|
|
print("Second pass of grades and student history...")
|
|
student_history = codecs.open('cache/normalized_student_history.csv','w','utf-8')
|
|
student_history.write("studentid,studentname,course,action,when,grade,sem_name,first_sem,last_sem,tenure_length,sem_index\n")
|
|
semester_order.reverse()
|
|
for blk in normalized_blocks:
|
|
info = blk[-1]
|
|
first = semester_order.index(info['first']) + 1
|
|
last = semester_order.index(info['last']) + 1
|
|
length = last - first + 1
|
|
|
|
for course in blk[:-1]:
|
|
parts = course.split(',')
|
|
#print(parts)
|
|
sem = parts[5]
|
|
sem_index = semester_order.index(sem) - first + 2
|
|
stu = int(parts[0])
|
|
crs = parts[2]
|
|
grade = ""
|
|
if stu in grades_by_student_course:
|
|
if crs in grades_by_student_course[stu]:
|
|
grade = grades_by_student_course[stu][crs]
|
|
|
|
student_history.write(",".join([parts[0], parts[1], parts[2], parts[3], parts[4], grade, parts[5], str(first), str(last), str(length), str(sem_index), ]) + '\n')
|
|
|
|
# make "unified records" or one line per student
|
|
student_history_2 = codecs.open('cache/normalized_student_history2.csv','w','utf-8')
|
|
allcourse = course_main_record()
|
|
#print(allcourse)
|
|
template = ['studentid', 'studentname', 'tenure_length']
|
|
template.extend(allcourse)
|
|
#print(template)
|
|
student_history_2.write( ",".join(template) + "\n" )
|
|
|
|
for blk in normalized_blocks:
|
|
student_block = []
|
|
info = blk[-1]
|
|
first = semester_order.index(info['first']) + 1
|
|
last = semester_order.index(info['last']) + 1
|
|
length = last - first + 1
|
|
|
|
temp_course_holder = {}
|
|
temp_course_grade_holder = {}
|
|
|
|
for course in blk[:-1]:
|
|
parts = course.split(',')
|
|
#print(parts)
|
|
sem = parts[5]
|
|
sem_index = semester_order.index(sem) - first + 2
|
|
stu = int(parts[0])
|
|
crs = parts[2]
|
|
if parts[3] == 'add':
|
|
temp_course_holder[crs] = sem_index
|
|
elif parts[3] == 'del' and crs in temp_course_holder:
|
|
del temp_course_holder[crs]
|
|
|
|
# now the temp_course_holder has the courses and semesters
|
|
for crs,sem_index in temp_course_holder.items():
|
|
grade = ""
|
|
if stu in grades_by_student_course:
|
|
if crs in grades_by_student_course[stu]:
|
|
grade = grades_by_student_course[stu][crs]
|
|
this_record = (crs, sem_index, grade)
|
|
student_block.append(this_record)
|
|
student_vector = [ parts[0], parts[1], str(length) ]
|
|
student_vector.extend(courses_to_vector_ordered(student_block))
|
|
|
|
student_history_2.write(",".join(student_vector) + '\n')
|
|
#print(student_vector)
|
|
|
|
def cluster_student_histories():
|
|
infile = 'cache/courses_student_scores.csv'
|
|
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from kneed import KneeLocator
|
|
from sklearn.datasets import make_blobs
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
df = pd.read_csv(infile)
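

# Hedged sketch (not part of the original pipeline): one way cluster_student_histories
# could be finished, following the docstring's "clustering: kmeans" idea. It assumes
# grades_to_vectors(boolean=0) yields (id, vector, courses) rows as defined above; the
# cluster count k=8 and the output file name are arbitrary illustrations, not tuned values.
def cluster_student_vectors_sketch(k=8):
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    ids, vectors = [], []
    for sid, vector, _courses in grades_to_vectors(boolean=0):
        ids.append(sid)
        vectors.append(vector)

    # standardize the 0/1 course-membership vectors, then cluster
    scaled = StandardScaler().fit_transform(vectors)
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(scaled)

    # one row per student: id, cluster label
    with open('cache/student_clusters_sketch.csv', 'w') as out:
        out.write("student_id,cluster\n")
        for sid, label in zip(ids, labels):
            out.write(f"{sid},{label}\n")
    return labels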
|
|
|
|
def dept(s):
|
|
parts = s.split(' ')
|
|
return parts[0]
|
|
|
|
def try_make_sched():
|
|
term = "fa23"
|
|
sched = requests.get(f"http://gavilan.cc/schedule/{term}_sched.json").json()
|
|
#print(json.dumps(sched,indent=2))
|
|
|
|
d = "CSIS"
|
|
courses = [ [x['code'], x['crn']] for x in sched if dept(x['code'])==d ]
|
|
teachers = { x['teacher'] for x in sched if dept(x['code'])==d }
|
|
|
|
print(courses)
|
|
print(teachers)
|
|
|
|
|
|
|
|
def sched_lookup_tables():
|
|
|
|
# Renumber the semesters
|
|
# sp16 su16 fa16 wi17 sp17 su17 fa17 wi18
|
|
#semesters = "sp18 su18 fa18 wi19 sp19 su19 fa19 wi20 sp20 su20 fa20 wi21 sp21 su21 fa21 wi22 sp22 su22 fa22 wi23 sp23 su23 fa23 wi24 sp24 su24 fa24 wi25 sp25 su25 fa25 wi26".split(" ")
|
|
|
|
sem_fourcode = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24 sp25 su25 fa25".split(" ")
|
|
int_numbers = [x for x in range(1,len(sem_fourcode)+1)]
|
|
fourcode_2_int = {semester: number for semester, number in zip(sem_fourcode, int_numbers)}
|
|
int_2_fourcode = {v: k for k, v in fourcode_2_int.items()}
|
|
|
|
sis_2_fourcode = {}
|
|
fourcode_2_sis = {}
|
|
yr = 2018
|
|
sems = ['30','50','70']
|
|
i = 0
|
|
semcodes = []
|
|
while yr < 2026:
|
|
for s in sems:
|
|
semcodes.append(f"{yr}{s}")
|
|
sis_2_fourcode[f"{yr}{s}"] = sem_fourcode[i]
|
|
fourcode_2_sis[sis_2_fourcode[f"{yr}{s}"]] = f"{yr}{s}"
|
|
#print(f"UPDATE schedule SET semsis={yr}{s} WHERE sem='{semesters[i]}';")
|
|
i += 1
|
|
yr += 1
|
|
return fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes
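
# Usage sketch for the lookup tables above (values shown are what the code computes,
# e.g. 'sp18' is semester 1 and maps to SIS term '201830'):
#   fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
#   fourcode_2_int['sp18']    # 1
#   int_2_fourcode[1]         # 'sp18'
#   fourcode_2_sis['sp18']    # '201830'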
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def section_stats_bymode():
|
|
data = query_multiple("SELECT code, semsis, COUNT(id) AS sections, sum(act) filter (WHERE type='in-person') AS inperson, sum(act) filter (WHERE type='online') AS online, sum(act) filter (WHERE type='hybrid') AS hybrid, sum(act) filter (WHERE type='online live') AS onlinelive FROM schedule GROUP BY code, semsis ORDER BY code, semsis;", 'cache/canvas_data/data20231012.db')
|
|
import pandas as pd
|
|
df = pd.DataFrame(data)
|
|
df.fillna(0,inplace=True)
|
|
for L in 'sections,inperson,online,hybrid,onlinelive'.split(','):
|
|
df[L] = df[L].astype(int)
|
|
print(df)
|
|
df.to_csv('cache/section_stats_bymode.csv')
|
|
return df
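
# The resulting frame (also written to cache/section_stats_bymode.csv) has one row per
# (code, semsis) pair with integer columns: sections, inperson, online, hybrid, onlinelive.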
|
|
|
|
def section_stats():
|
|
# for each course, (ENG1A) how many are enrolled in each all sections?
|
|
# (and break down by mode,time,location,etc)
|
|
#
|
|
# for each course, how many are first semester gav students?
|
|
#
|
|
data = query_multiple("SELECT * FROM schedule ORDER BY code,id", 'cache/canvas_data/data20231012.db')
|
|
|
|
fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
|
|
|
|
# Assuming your data is in a list of dictionaries called data
|
|
df = pd.DataFrame(data)
|
|
|
|
# Drop the specified columns
|
|
df = df.drop(columns=['id', 'crn', 'units', 'teacher', 'start', 'end', 'loc', 'cap'])
|
|
|
|
codecs.open('cache/sem_mapping.json','w','utf-8').write(json.dumps(fourcode_2_int,indent=2))
|
|
|
|
df['sem'] = df['sem'].map(fourcode_2_int)
|
|
df.set_index('sem', inplace=True)
|
|
return df
|
|
|
|
def simple_exp_smoothing_section_model():
|
|
sout = codecs.open('cache/section_predictions.txt','w','utf-8')
|
|
from statsmodels.tsa.api import SimpleExpSmoothing
|
|
warnings.filterwarnings("ignore")
|
|
periods = 3
|
|
start = 19
|
|
|
|
df = section_stats()
|
|
print(df)
|
|
df = df.sort_index()
|
|
|
|
predictions = {}
|
|
for course_code in df['code'].unique():
|
|
try:
|
|
print(course_code)
|
|
sout.write(course_code + "\n")
|
|
this_set = df[df['code'] == course_code]['act']
|
|
this_set = this_set.groupby('sem').sum()
|
|
#this_set.fillna(method='ffill', inplace=True)
|
|
#this_set.fillna(0, inplace=True)
|
|
|
|
# Create a new index with all required semesters
|
|
new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
|
|
|
|
# Reindex the DataFrame and fill missing values with 0
|
|
this_set = this_set.reindex(new_index, fill_value=0)
|
|
|
|
print(this_set.to_string())
|
|
|
|
sout.write(this_set.to_string() + "\n")
|
|
model = SimpleExpSmoothing(this_set)
|
|
fit = model.fit(smoothing_level=0.2) # initiate with a smoothing level of 0.2
|
|
# Later modify above line based on if your data has high or low variability
|
|
|
|
#prediction = fit.forecast(start=32,end=34) # predict attendance for the next 3 semesters
|
|
prediction = fit.predict(start=start,end=start+4)
|
|
print(prediction)
|
|
sout.write(str(prediction) + "\n")
|
|
sout.flush()
|
|
predictions[course_code] = prediction
|
|
except Exception as e:
|
|
print(f"Model creation failed for {course_code} due to {str(e)}")
|
|
sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
|
|
"""
|
|
model = ARIMA(this_set, order=(1,1,1)) #ARIMA params (p, d, q)
|
|
model_fit = model.fit()
|
|
forecast_result = model_fit.forecast(steps=periods)
|
|
if forecast_result:
|
|
predictions[course_code] = forecast_result[0]
|
|
else:
|
|
print(f"No prediction for {course_code}. Skipping...")"""
|
|
|
|
|
|
# statistics - use a smooth exponential model to predict the next 3 semesters of enrollment
|
|
# Doesn't really seem to get the patterns.
|
|
def exp_smoothing_section_model():
|
|
sout = codecs.open('cache/section_predictions.txt','w','utf-8')
|
|
from statsmodels.tsa.api import ExponentialSmoothing
|
|
warnings.filterwarnings("ignore")
|
|
periods = 3
|
|
start = 19
|
|
|
|
fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
|
|
|
|
df = section_stats()
|
|
print(df)
|
|
df = df.sort_index()
|
|
|
|
predictions = {}
|
|
for course_code in df['code'].unique():
|
|
try:
|
|
print(course_code)
|
|
#sout.write(course_code + "\n")
|
|
this_set = df[df['code'] == course_code]['act']
|
|
this_set = this_set.groupby('sem').sum()
|
|
#this_set.fillna(method='ffill', inplace=True)
|
|
#this_set.fillna(0, inplace=True)
|
|
|
|
# Create a new index with all required semesters
|
|
new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
|
|
|
|
# Reindex the DataFrame and fill missing values with 0
|
|
this_set = this_set.reindex(new_index, fill_value=0)
|
|
|
|
print(this_set.to_string())
|
|
|
|
for i,v in this_set.items():
|
|
sout.write(f"{course_code},{int_2_fourcode[i]},{v}\n")
|
|
|
|
model = ExponentialSmoothing(this_set, seasonal_periods=4, trend='add', seasonal='add')
|
|
fit = model.fit()
|
|
|
|
prediction = fit.predict(start=start,end=start+4)
|
|
print(prediction)
|
|
for i,v in prediction.items():
|
|
v = int(v)
|
|
if v<0: v=0
|
|
sout.write(f"{course_code},{int_2_fourcode[i]}, {v}\n")
|
|
sout.flush()
|
|
predictions[course_code] = prediction
|
|
except Exception as e:
|
|
print(f"Model creation failed for {course_code} due to {str(e)}")
|
|
#sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
|
|
|
|
def student_by_semester():
|
|
|
|
query = """
|
|
SELECT u.name, u.canvasid, s.code, s.semsis FROM users u
|
|
JOIN enrollment e ON u.id = e.user_id
|
|
JOIN courses c ON c.id = e.course_id
|
|
JOIN terms t ON c.termid = t.id
|
|
JOIN schedule s ON c.schedule = s.id
|
|
WHERE e.type='StudentEnrollment' AND e.workflow='active'
|
|
ORDER BY u.sortablename, s.semsis;
|
|
"""
|
|
|
|
df = pd.DataFrame(query_multiple(query, 'cache/canvas_data/data20231012.db'))
|
|
|
|
# Apply groupby and aggregate the courses in each semester in a comma-separated string
|
|
df['courses'] = df.groupby(['name','canvasid','semsis'])['code'].transform(lambda x : ' / '.join(x))
|
|
|
|
# Removing duplicates
|
|
df = df[['name','canvasid','semsis','courses']].drop_duplicates()
|
|
|
|
# Create pivot table
|
|
df_pivot = df.pivot_table(values='courses', index=['name','canvasid'], columns='semsis', aggfunc='first').reset_index()
|
|
|
|
# Adding prefix to new columns names to recognize them
|
|
df_pivot.columns = [str(col) + '_sem' if isinstance(col, int) else col for col in df_pivot.columns]
|
|
|
|
df_pivot.to_csv('cache/student_by_semester.csv')
|
|
|
|
|
|
def sections_grouped_by_year_mode():
|
|
df = section_stats_bymode()
|
|
|
|
# list of unique courses
|
|
df_all_courses = df['code'].unique()
|
|
|
|
# list of unique semesters
|
|
df_all_semesters = df['semsis'].unique()
|
|
df_all_semesters.sort()
|
|
|
|
|
|
|
|
raw_data = {}
|
|
    for _, line in df.iterrows():   # iterate rows; iterating the DataFrame directly only yields column names
|
|
print(line['semsis'])
|
|
sis = str(line['semsis'])
|
|
year = sis[0:4]
|
|
raw_data[ f"{line['code']}{year}"] = [line['inperson'],line['online'],line['hybrid'],line['onlinelive']]
|
|
    print(raw_data)
    return   # work in progress: the per-course / per-year grouping sketched below is not wired up yet
|
|
|
|
for course in df_all_courses:
|
|
c = str(course)
|
|
template = {'code':[c,c,c], 'semsis':[], 'inperson':[], 'online':[], 'hybrid':[], 'onlinelive':[]}
|
|
|
|
# group semesters in to groups of 3 by year
|
|
for i in df_all_semesters:
|
|
j = str(i)
|
|
year = j[0:4]
|
|
print(f"{i} ({year})")
|
|
|
|
# for each course, for each group of 3 semesters, fill in values, using 0 if necessary
|
|
|
|
# ...
|
|
|
|
def lstm_model_sections():
|
|
from keras.models import Sequential
|
|
from keras.layers import Dense
|
|
from keras.layers import LSTM
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Preprocessing
|
|
|
|
# Normalize inputs for better performance
|
|
df = section_stats_bymode()
|
|
print(df)
|
|
scaler = MinMaxScaler(feature_range=(0, 1))
|
|
dataset_scaled = scaler.fit_transform(df.drop(['code', 'semsis'], axis=1))
|
|
print("scaled:")
|
|
print(df)
|
|
|
|
# Split features and targets (Assuming you want to predict 'online' enrollments)
|
|
X = dataset_scaled[:, 1:]
|
|
Y = dataset_scaled[:,0:1]
|
|
|
|
# Train / Test split
|
|
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
|
|
|
|
# Reshape input to be [samples, time steps, features] which is required for LSTM
|
|
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
|
|
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
|
|
|
|
print("x_train shape:", x_train.shape)
|
|
print(x_train)
|
|
|
|
print("\n\nTraining...\n\n")
|
|
|
|
# LSTM architecture
|
|
model = Sequential()
|
|
model.add(LSTM(50, input_shape=(X.shape[1], 1))) # 50 LSTM blocks
|
|
model.add(Dense(1)) # Since we are predicting only 1 output ('online' enrollments)
|
|
model.compile(loss='mean_squared_error', optimizer='adam')
|
|
model.fit(x_train, y_train, epochs=5, batch_size=1) # Training the model
|
|
|
|
# Prediction
|
|
scaler_predict = MinMaxScaler()
|
|
scaler_predict.fit_transform(df[['online']])
|
|
trainPredict = model.predict(x_train)
|
|
testPredict = model.predict(x_test)
|
|
|
|
# Invert predictions (Due to normalization)
|
|
trainPredict = scaler_predict.inverse_transform(trainPredict)
|
|
testPredict = scaler_predict.inverse_transform(testPredict)
|
|
|
|
# Now you have your future prediction in testPredict.
|
|
|
|
print("Predictions:")
|
|
print(testPredict)
|
|
np.savetxt('cache/section_predictions_lstm.txt',testPredict, fmt='%f')
|
|
|
|
    # Exploratory stopping point: the inverse-transformed predictions still need to be
    # joined back onto the original rows (course code / semester) before they are usable.
|
|
|
|
|
|
|
|
|
|
|
|
def visualize_course_modes_multi_semester():
|
|
import plotly.express as px
|
|
from plotly.subplots import make_subplots
|
|
seasons = {'sp':'30','su':'50','fa':'70'}
|
|
semcodes = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24".split(" ")
|
|
sems = { x:'20' + x[2:] + seasons[x[:2]] for x in semcodes }
|
|
sem_dfs = []
|
|
sem_dfs_depts = []
|
|
for s in sems.keys():
|
|
print(f"fetching {s}")
|
|
sched = requests.get(f"http://gavilan.cc/schedule/{s}_sched_expanded.json").json()
|
|
for crs in sched:
|
|
if 'extra' in crs: del crs['extra']
|
|
crs['dept'] = crs['code'].split(' ')[0]
|
|
df = pd.DataFrame(sched)
|
|
df_depts = df.copy()
|
|
df_depts = df_depts.drop(columns=['crn','sec','code','cmp','name','days','time','rem','wl_cap','wl_act','wl_rem','teacher','date','loc','ztc','time_start','time_end','start','end','doy'])
|
|
df = df.drop(columns=['crn','sec','code','cmp','name','days','time','rem','wl_cap','wl_act','wl_rem','teacher','date','loc','ztc','time_start','time_end','start','end','doy'])
|
|
grouped_by_dept = df_depts.groupby(['dept','type']).size().reset_index(name='count')
|
|
grouped_by_mode = df['type'].value_counts().reset_index()
|
|
grouped_by_dept["semester"] = sems[s]
|
|
grouped_by_mode["semester"] = sems[s]
|
|
sem_dfs.append(grouped_by_mode)
|
|
sem_dfs_depts.append(grouped_by_dept)
|
|
|
|
#grouped_json = grouped_by_dept.to_json(orient='records')
|
|
#j = json.loads(grouped_json)
|
|
#print(json.dumps(j,indent=2))
|
|
|
|
#grouped_by_dept.columns = ['Department', 'Count'] # rename the column names appropriately
|
|
#fig = px.bar(grouped_by_dept, x='Department', y='Count', title='Section Counts by Department')
|
|
#fig.write_html(f"cache/output_{s}.html")
|
|
|
|
|
|
combined_data = pd.concat(sem_dfs, axis=0)
|
|
#print(combined_data)
|
|
#combined_data = combined_data.rename(columns={'type':'count','index':'type'})
|
|
#print(combined_data)
|
|
combined_data.reset_index(drop=True,inplace=True)
|
|
#print(combined_data)
|
|
pivoted_data = combined_data.pivot(index='semester', columns='type', values='count')
|
|
pivoted_data.reset_index(inplace=True)
|
|
|
|
fig = px.bar(pivoted_data, x='semester',y=['hybrid', 'in-person', 'online', 'online live'], barmode='stack',
|
|
title='Course Delivery by Semester',
|
|
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
|
|
fig.update_layout(height=200*len(fig['data']))
|
|
fig.write_html(f"cache/sections_by_deliverymode.html")
|
|
print(f"wrote to: cache/sections_by_deliverymode.html")
|
|
|
|
combined_data_depts = pd.concat(sem_dfs_depts, axis=0)
|
|
combined_data_depts.reset_index(drop=True,inplace=True)
|
|
#print(combined_data_depts)
|
|
combined_data_depts.to_csv('cache/section_delivery_by_dept.csv')
|
|
'''pivoted_data_depts = combined_data_depts.pivot(index='semester', columns='type', values='count')
|
|
pivoted_data_depts.reset_index(inplace=True)
|
|
|
|
fig = px.bar(pivoted_data_depts, x='semester',y=['hybrid', 'in-person', 'online', 'online live'], barmode='stack',
|
|
title='Course Delivery by Semester',
|
|
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
|
|
fig.write_html(f"cache/sections_depts_by_deliverymode.html")'''
|
|
|
|
unique_depts = combined_data_depts['dept'].unique()
|
|
fig = make_subplots(rows=len(unique_depts), cols=1,
|
|
subplot_titles=unique_depts,
|
|
)
|
|
|
|
print("\n\nindividual departments\n\n")
|
|
for i, dept in enumerate(unique_depts, start=1):
|
|
#if i>1: break
|
|
# Filter the dataframe for the current department
|
|
print(f"{dept}")
|
|
dept_data = combined_data_depts[combined_data_depts['dept'] == dept]
|
|
|
|
# Pivot the data frame
|
|
pivoted_dept_data = dept_data.pivot(index='semester', columns='type', values='count').reset_index()
|
|
pivoted_dept_data.fillna(0,inplace=True)
|
|
#print(pivoted_dept_data)
|
|
|
|
# Plot the data
|
|
columns_to_plot = ['hybrid', 'in-person', 'online', 'online live']
|
|
valid_columns = [col for col in columns_to_plot if col in pivoted_dept_data.columns]
|
|
|
|
# to avoid futurewarning
|
|
# print(f" {valid_columns}")
|
|
# if len(valid_columns)==1: valid_columns = valid_columns[0]
|
|
# print(f" {valid_columns}")
|
|
|
|
fig_sub = px.bar(pivoted_dept_data, x='semester', y=valid_columns, barmode='stack',
|
|
#title=f'Course Delivery by Semester for {dept}',
|
|
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
|
|
fig.add_traces(fig_sub['data'], rows=[i]*len(fig_sub['data']), cols=[1]*len(fig_sub['data']))
|
|
fig.update_layout(height=70*len(fig['data']), width=1100, showlegend=False)
|
|
fig.write_html(f"cache/sections_depts_by_deliverymode.html")
|
|
|
|
|
|
|
|
# given a list of classes, report back about the student on one row of info
|
|
def student_history_analysis(sh):
|
|
from functools import reduce
|
|
semesters_set = set()
|
|
num_sems = 0
|
|
num_course = len(sh)
|
|
num_units = 0
|
|
units_online = 0
|
|
units_inperson = 0
|
|
units_hybrid = 0
|
|
units_ol = 0
|
|
fa_23_units = 0
|
|
fa_23_online_units = 0
|
|
fa23_courses = 0
|
|
fa23_onlinecourses = 0
|
|
|
|
#un_list = [ float(x['units'].split('-')[0].split('/')[0]) for x in sh ]
|
|
#num_units = reduce(lambda x,y: x+y, un_list)
|
|
for section in sh:
|
|
semesters_set.add(section['sis'])
|
|
units = float(section['units'].split('-')[0].split('/')[0])
|
|
num_units += units
|
|
if section['type'] == 'in-person': units_inperson += units
|
|
if section['type'] == 'online': units_online += units
|
|
if section['type'] == 'hybrid': units_hybrid += units
|
|
if section['type'] == 'online live': units_ol += units
|
|
|
|
if section['sis'] == '202370':
|
|
fa_23_units += units
|
|
fa23_courses += 1
|
|
if not section['type'] == 'in-person':
|
|
fa_23_online_units += units
|
|
fa23_onlinecourses += 1
|
|
|
|
num_sems = len(semesters_set)
|
|
if num_units == 0:
|
|
pct_online = 0
|
|
else:
|
|
pct_online = round(100 * (units_online+units_hybrid+units_ol) / num_units, 1)
|
|
|
|
if fa_23_units == 0:
|
|
fa_23_pct_online = 0
|
|
else:
|
|
fa_23_pct_online = round(100 * (fa_23_online_units) / fa_23_units, 1)
|
|
|
|
if fa23_courses == 0:
|
|
fa23_pct_course_online = 0
|
|
else:
|
|
fa23_pct_course_online = round(100 * (fa23_onlinecourses) / fa23_courses, 1)
|
|
    # total units leads the summary (used by the commented-out sort in report_student_stats)
    summary = [num_units, num_course, f"\"{sh[0]['sortablename']}\",{sh[0]['canvasid']},{num_sems},{num_course},{num_units},{units_online},{units_inperson},{units_hybrid},{units_ol},{pct_online},{fa_23_units},{fa_23_online_units},{fa_23_pct_online},{fa23_courses},{fa23_onlinecourses},{fa23_pct_course_online}"]
|
|
return summary
|
|
|
|
def report_student_stats():
|
|
from localcache import users_with_history, students_current_semester
|
|
from itertools import groupby
|
|
import plotly.graph_objects as go
|
|
import plotly.io as pio
|
|
import numpy as np
|
|
|
|
u = users_with_history()
|
|
this_sem = [x['canvasid'] for x in students_current_semester()]
|
|
|
|
df = pd.DataFrame(u)
|
|
filtered_df = df[df['canvasid'].isin(this_sem)]
|
|
filtered_df.to_csv('cache/student_history_current_students.csv',index=False)
|
|
|
|
oo = codecs.open('cache/student_units.txt','w','utf-8')
|
|
oo.write("name,id,num_sems,num_course,num_units,units_online,units_inperson,units_hybrid,units_ol,percent_online,fa23_units,fa23_onlineunits,fa23_pct_online,fa23_num_courses,fa23_num_onlinecourses,fa23_percent_online_course\n")
|
|
# Now group by that key
|
|
def kk(x): return x['canvasid']
|
|
grouped_dict = {key:list(group) for key, group in groupby(u, kk)}
|
|
|
|
shorter = []
|
|
percentages = []
|
|
|
|
for k,g in grouped_dict.items():
|
|
if k in this_sem:
|
|
h = student_history_analysis(g)
|
|
#oo.write(json.dumps(h[2],indent=2)+ "\n")
|
|
oo.write( str(h[2]) + "\n")
|
|
shorter.append(h)
|
|
p = h[2].split(',')[-1]
|
|
percentages.append(float(p))
|
|
else:
|
|
print(f"Skipping {k}")
|
|
#print(this_sem)
|
|
#oo.write('units,courses\n')
|
|
#shorter.sort(key=lambda x: x[0], reverse=True)
|
|
#for s in shorter:
|
|
# print(s[2])
|
|
# #oo.write(f"{s[0]},{s[1]}\n")
|
|
# #print('\n\n')
|
|
|
|
# Create a histogram
|
|
fig = go.Figure(data=[go.Histogram(x=percentages, xbins=dict(start=0,end=101, size=10))])
|
|
|
|
# Save the figure in an HTML file
|
|
pio.write_html(fig, 'cache/student_pct_onlinecourse.html')
|
|
|
|
|
|
def test_rpy():
|
|
pass
|
|
|
|
'''
|
|
def test_rpy():
|
|
from rpy2 import robjects
|
|
from rpy2.robjects import Formula, Environment
|
|
from rpy2.robjects.vectors import IntVector, FloatVector
|
|
from rpy2.robjects.lib import grid
|
|
from rpy2.robjects.packages import importr, data
|
|
from rpy2.rinterface import RRuntimeWarning
|
|
import warnings
|
|
|
|
# The R 'print' function
|
|
rprint = robjects.globalenv.get("print")
|
|
stats = importr('stats')
|
|
grdevices = importr('grDevices')
|
|
base = importr('base')
|
|
datasets = importr('datasets')
|
|
|
|
grid.activate()
|
|
import math, datetime
|
|
import rpy2.robjects.lib.ggplot2 as ggplot2
|
|
import rpy2.robjects as ro
|
|
from rpy2.robjects.packages import importr
|
|
base = importr('base')
|
|
|
|
mtcars = data(datasets).fetch('mtcars')['mtcars']
|
|
|
|
pp = ggplot2.ggplot(mtcars) + \
|
|
ggplot2.aes_string(x='wt', y='mpg', col='factor(cyl)') + \
|
|
ggplot2.geom_point() + \
|
|
ggplot2.geom_smooth(ggplot2.aes_string(group = 'cyl'),
|
|
method = 'lm')
|
|
pp.plot()
|
|
|
|
|
|
|
|
def test_rpy2():
|
|
import rpy2
|
|
print(rpy2.__version__)
|
|
import rpy2.robjects as robjects
|
|
from rpy2.robjects.packages import importr
|
|
# import R's "base" package
|
|
base = importr('base')
|
|
|
|
# import R's "utils" package
|
|
utils = importr('utils')
|
|
pi = robjects.r['pi']
|
|
print(f"pi={pi[0]}")
|
|
'''
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.tree import DecisionTreeClassifier, export_graphviz
|
|
import graphviz
|
|
|
|
from joblib import dump, load
|
|
|
|
|
|
def cluster_by_mode_1():
|
|
|
|
# Load the data from a CSV file
|
|
data = pd.read_csv('cache/students_bymode.csv')
|
|
|
|
# Extract the relevant features
|
|
features = data[['num_semesters', 'num_units', 'inperson_units', 'hybrid_units', 'online_units']]
|
|
|
|
# Standardize the features
|
|
scaler = StandardScaler()
|
|
scaled_features = scaler.fit_transform(features)
|
|
|
|
# Perform clustering with different numbers of clusters
|
|
for n_clusters in range(4, 12):
|
|
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
|
kmeans.fit(scaled_features)
|
|
|
|
# Add the cluster labels to the original data
|
|
data[f'cluster_{n_clusters}'] = kmeans.labels_
|
|
|
|
print(f"Clustering with {n_clusters} clusters:")
|
|
print(data.groupby(f'cluster_{n_clusters}').size())
|
|
print()
|
|
|
|
# Save the updated data with cluster labels to a new CSV file
|
|
data.to_csv('cache/students_bymode_with_clusters_1.csv', index=False)
|
|
|
|
|
|
|
|
|
|
def cluster_by_mode():
|
|
data = pd.read_csv('cache/students_bymode.csv')
|
|
|
|
# Split features and target
|
|
X = data.drop('g_number', axis=1)
|
|
y = data['g_number']
|
|
|
|
# Train decision tree classifier
|
|
clf = DecisionTreeClassifier()
|
|
clf.fit(X, y)
|
|
|
|
# Visualize decision tree
|
|
dot_data = export_graphviz(clf, out_file=None,
|
|
feature_names=X.columns,
|
|
class_names=y.unique(),
|
|
filled=True, rounded=True,
|
|
special_characters=True)
|
|
graph = graphviz.Source(dot_data)
|
|
graph.render('decision_tree', view=True)
|
|
data.to_csv('cache/students_bymode_with_dt.csv', index=False)
|
|
|
|
|
|
def cluster_by_mode_2():
|
|
|
|
# Load the data from a CSV file
|
|
data = pd.read_csv('cache/students_bymode.csv')
|
|
|
|
# Extract the features (excluding the 'g_number' column)
|
|
features = data.drop('g_number', axis=1)
|
|
|
|
# Scale the features to have zero mean and unit variance
|
|
scaler = StandardScaler()
|
|
scaled_features = scaler.fit_transform(features)
|
|
|
|
# Determine the ideal number of clusters using the elbow method
|
|
inertias = []
|
|
for k in range(4, 40): # Try different values of k (e.g., 1 to 10)
|
|
kmeans = KMeans(n_clusters=k, random_state=42)
|
|
kmeans.fit(scaled_features)
|
|
inertias.append(kmeans.inertia_)
|
|
|
|
# Plot the elbow curve
|
|
import matplotlib.pyplot as plt
|
|
plt.plot(range(4, 40), inertias, marker='o')
|
|
plt.xlabel('Number of Clusters (k)')
|
|
plt.ylabel('Inertia')
|
|
plt.title('Elbow Method')
|
|
plt.show()
|
|
|
|
# Choose the ideal number of clusters based on the elbow curve
|
|
ideal_k = 12 # Adjust this based on your observation
|
|
|
|
# Perform clustering with the ideal number of clusters
|
|
kmeans = KMeans(n_clusters=ideal_k, random_state=42)
|
|
kmeans.fit(scaled_features)
|
|
|
|
|
|
|
|
# Get the cluster labels for each data point
|
|
labels = kmeans.labels_
|
|
|
|
# Add the cluster labels to the original data
|
|
data['Cluster'] = labels
|
|
|
|
# Save the cluster labels to a new CSV file
|
|
data.to_csv('cache/students_bymode_with_clusters_2.csv', index=False)
|
|
|
|
# Get the cluster centers (centroids)
|
|
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
|
|
|
|
# Print the cluster centers
|
|
for i, centroid in enumerate(centroids):
|
|
print(f"Cluster {i} center:")
|
|
for feature, value in zip(features.columns, centroid):
|
|
print(f"{feature}: {value}")
|
|
print()
|
|
|
|
|
|
# Save the trained objects to files
|
|
dump(kmeans, 'kmeans.joblib')
|
|
dump(scaler, 'scaler.joblib')
|
|
|
|
    # Load the saved objects for future use
    loaded_kmeans = load('kmeans.joblib')
    loaded_scaler = load('scaler.joblib')

    # To score new rows, load a CSV into `new_data` first -- see cluster_with_new_data()
    # below; as written here `new_data` is undefined, so these lines stay commented out.
    # new_data_scaled = loaded_scaler.transform(new_data)
    # predictions = loaded_kmeans.predict(new_data_scaled)
|
|
|
|
|
|
def cluster_with_new_data():
    ## NOT TESTED
    # Loads the kmeans and scaler objects saved by cluster_by_mode_2().
    kmeans = load('kmeans.joblib')
    scaler = load('scaler.joblib')

    # Load the new data
    new_data = pd.read_csv('new_data.csv')

    # Extract the features from the new data
    new_features = new_data.drop('g_number', axis=1)

    # Scale the new features using the fitted scaler
    scaled_new_features = scaler.transform(new_features)

    # Predict the cluster labels for the new data
    new_labels = kmeans.predict(scaled_new_features)

    # Add the cluster labels to the new data
    new_data['Cluster'] = new_labels
    return new_data

|
|
|
|
if __name__ == "__main__":
|
|
options = { 1: ['get all historical grades from ilearn',get_all] ,
|
|
2: ['process grades csv file',process_grades] ,
|
|
3: ['reorganize full grades file by student', reorganize_grades_student],
|
|
4: ['test shortname parse',nametest] ,
|
|
5: ['test sem codes',codetest] ,
|
|
6: ['get student data from orientations', get_student_orientations],
|
|
7: ['manage course master list', all_course_names],
|
|
8: ['grades to vectors', grades_to_vectors],
|
|
9: ['semester startdates list', semester_dates],
|
|
10: ['normalize course histories', normalize_course_histories],
|
|
11: ['cluster student histories', cluster_student_histories],
|
|
12: ['try to make a schedule', try_make_sched],
|
|
13: ['ES model section predict attendance', exp_smoothing_section_model],
|
|
14: ['section stats by mode', section_stats_bymode],
|
|
15: ['student courses by semester', student_by_semester],
|
|
16: ['LSTM model sections', lstm_model_sections],
|
|
17: ['rearrange section data to yearly form', sections_grouped_by_year_mode],
|
|
30: ['visualize course modes multi semester', visualize_course_modes_multi_semester],
|
|
31: ['Report on student stats', report_student_stats],
|
|
32: ['test rpy', test_rpy],
|
|
33: ['cluster students by mode', cluster_by_mode],
|
|
}
|
|
print ('')
|
|
|
|
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
|
|
resp = int(sys.argv[1])
|
|
print("\n\nPerforming: %s\n\n" % options[resp][0])
|
|
|
|
else:
|
|
print ('')
|
|
for key in options:
|
|
print(str(key) + '.\t' + options[key][0])
|
|
|
|
print('')
|
|
resp = input('Choose: ')
|
|
|
|
# Call the function in the options dict
|
|
options[ int(resp)][1]()
|