# canvasapp/stats.py
# statistics
"""
## Investigate: Success rates (grades) of students in:
- online courses (overall)
- sync, async, and online live
- teachers/courses that have passed POCR (are all async?)
- teachers that have done more than the minimum training in online teaching
- in-person classes, if grades are available
## Data collection
- Choose how many semesters (10?)
- Script 1 - given a CRN and Semester, download all grades
- Check if grades were used and make sense
- Compute mean, % > 70, median, etc.
- Anonymization steps
- replace teacher names w/ id number
- replace student names w/ id number
- replace course names w/ course code
- Script 2 - given all semester schedules, generate lists of:
  - CRNs which are online, online live, hybrid, in-person, excluded
  - CRNs in which teacher and course have passed POCR (and semester is greater than their pass date)
  - CRNs in which teacher passed POCR for a different course (and semester is greater than their pass date)
  - CRNs to exclude, for example SP20, because of COVID. Possibly SU20 and FA20
  - CRNs which are POCR approved
  - CRNs in which teacher has done more than the minimum training in online teaching
  - Student ids which have participated in the online orientation over a certain threshold
- Next steps: generate the cross-reference for what categories teachers are in, and
  integrate into the main data file.
- Next steps (June/July 2023)
- add campus, time of day, and sem_order (which semester in their college career did they take it) columns
- Organize rows by students
  - Develop a way to categorize them: by course set and/or score set (clustering: kmeans, forest, etc.)
- Goals
- display and summarize clusters of students on a dashboard
- ongoing categorization (implying course recommendations and interventions) based on it
## Hypothesis Testing
"""
import codecs, os, warnings, itertools
import json, csv, requests, sys, re
import numpy as np
import pandas as pd
from multiprocessing import Semaphore
from statistics import mean, median, stdev
from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments
from localcache import query_multiple
from collections import defaultdict
all_grades_file = "cache/grades_all.csv"
all_courses_file = "cache/course_grades_all.csv"
all_courses_file2 = "cache/course_grades_compact.csv"
all_courses_file3 = "cache/course_grades_full.csv"
all_courses_file4 = "cache/course_grades_full_bystudent.csv"
all_courses_file5 = "cache/courses_passed_bystudent.csv"
student_courses_scores = "cache/courses_student_scores.csv"
student_orientation_participation = "cache/participation_orientation_courses.json"
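# A minimal sketch of the per-section metrics named in the plan above
# ("Compute mean, % > 70, median, etc."). The function name is hypothetical;
# the pipeline's real version is process_one_course_grades() further down.
def grade_summary_sketch(scores):
    from statistics import mean, median
    if not scores:
        return {}
    return {
        'mean': round(mean(scores), 2),
        'median': round(median(scores), 2),
        'pct_above_70': round(100 * len([s for s in scores if s >= 70]) / len(scores), 1),
    }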
def num(s):
    # Convert a CSV cell to a number; strip a trailing ".0" so that
    # integer-valued floats parse as ints.
    if s == '': return 0
    s = re.sub(r'\.0$', '', s)
    try:
        return int(s)
    except ValueError:
        return float(s)
def sem_num_to_code(sem_num):
p = re.search(r'^(\d\d\d\d)(\d\d)$', sem_num)
if p:
yr = p.group(1)[2:4]
sem = p.group(2)
lookup = {'10':'wi','30':'sp', '50':'su', '70':'fa'}
return f"{lookup[sem]}{yr}"
return ""
def sem_code_to_num(sem_code): # fa23
p = re.search(r'^([a-z]{2})(\d\d)$', sem_code)
if p:
s = p.group(1)
y = p.group(2)
lookup = {'wi':'10','sp':'30', 'su':'50', 'fa':'70'}
return f"20{y}{lookup[s]}"
return ""
def codetest():
sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')
codes = 'fa21 wi22 sp23 su23 fa23 wi24'.split(' ')
for s in sems:
print("{}: {}".format(s, sem_num_to_code(s)))
for c in codes:
print("{}: {}".format(c, sem_code_to_num(c)))
def get_all():
terms = '178 177 176 175 174 173 172 171 168 65 64 62 63 61 60 25 26 23 22 21'.split(' ')
sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')
# Save grades to a CSV file
with open(all_grades_file, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["crn", "sem", "coursecode", "s_can_id","g","name", "current", "final"])
for (term,sem) in zip(terms,sems):
print(term,sem,"\n")
courses = getCoursesInTerm(term,get_fresh=0,show=0,active=1)
for c in courses:
print(c['name'])
c_code = c['course_code']
grades(writer, sem, c['id'], c_code)
csvfile.flush()
def grades(writer, sem, COURSE_ID, course_code):
params = { "include[]": ["enrollments", "current_grading_period_scores"] }
grades = fetch(url + f"/api/v1/courses/{COURSE_ID}/users",0, params)
#grades = json.loads(grades.text)
for student in grades:
try:
id = student["id"]
name = student["name"]
g = student["login_id"]
print("\t", name)
if student['enrollments'][0]['type'] == 'StudentEnrollment':
grade = student["enrollments"][0]["grades"]["final_score"]
current = student["enrollments"][0]["grades"]["current_score"]
writer.writerow([COURSE_ID, sem, course_code, id, g, name, current, grade])
except Exception as e:
print("Exception:", e)
def get_student_orientations():
courses = {'iLearn Student Orientation 2022':'9768', # 8170 students
'Kickstart Online Orientation - Transfer':'36', # 6149
'Kickstart Online Orientation - New to College':'35', # 5392
'LIB732 SP18':'3295', # 2193
'LIB732 FA17':'2037', # 1868
'LIB732 SP17':'69', # 1645
'Kickstart Online Orientation - Returning':'37', # 1463
'iLearn Student Orientation 2023':'15924', # 1292
'LIB732 SU17':'1439' # 1281
}
views_bycourse = {}
all_student_ids = set()
# get pageviews of each orientation course
for c,i in courses.items():
print(c)
cache_file_name = f'cache/participation_course_{i}.json'
student_ids = [x[1] for x in get_course_enrollments(i)]
all_student_ids.update(student_ids)
if os.path.exists(cache_file_name):
pv = json.loads(codecs.open(cache_file_name,'r','utf-8').read())
else:
pv = get_student_page_views(i, student_ids)
codecs.open(cache_file_name,'w','utf-8').write(json.dumps(pv,indent=2))
views_bycourse[i] = pv
# add up pageviews for each student
views_bystudent = {}
for student_id in all_student_ids:
views_bystudent[student_id] = sum([views_bycourse[i].get(student_id,0) for i in courses.values()])
codecs.open(student_orientation_participation,'w','utf-8').write(json.dumps(views_bystudent,indent=2))
def get_student_page_views(course_id, student_ids):
page_views = {}
verbose = 0
for student_id in student_ids:
a = f'/api/v1/courses/{course_id}/analytics/users/{student_id}/activity'
response = fetch(url + a, verbose)
page_views[student_id] = sum(response.get('page_views', {}).values())
if verbose: print(page_views)
return page_views
schedules = {}
orientations = {}
def load_schedules():
global schedules
if not schedules:
for f in os.listdir('cache/schedule'):
m = re.search(r'(\w\w\d\d)_sched_expanded\.json', f)
if m:
sem = m.group(1)
schedules[sem] = json.loads( codecs.open('cache/schedule/' + f, 'r', 'utf-8').read() )
def load_orientations():
global orientations
if not orientations:
orientations = json.loads( codecs.open(student_orientation_participation,'r','utf-8').read() )
return orientations
def to_crn_fallback(name):
#print(name)
name = name.lower()
try:
m1 = re.search(r'(\d\d\d\d\d)',name)
if m1:
crn = m1.group(1)
else:
return None,None
        m2 = re.search(r'([wispufa][wispufa]\d\d)', name)
if m2:
sem = m2.group(1)
else:
return None, None
#print(name, crn, sem)
return crn, sem
except Exception as e:
#print("Exception: ", e, name)
return None, None
def ilearn_name_to_course_code(iname):
parts = iname.split(' ')
code = parts[0]
return code
def short_name_to_crn(name):
#print(name)
try:
parts = name.split(' ')
code = parts[0]
sem = parts[1]
crn = parts[2]
m_sem = re.search(r'^(\w\w\d\d)$',sem)
if not m_sem:
return to_crn_fallback(name)
m = re.search(r'^(\d\d\d\d\d)$',crn)
if m:
return crn,sem
else:
crn_parts = crn.split('/')
m = re.search(r'^(\d\d\d\d\d)$',crn_parts[0])
if m:
return crn_parts[0],sem
#print("non standard course short name: ", code, sem, crn)
return to_crn_fallback(name)
except Exception as e:
#print("Exception: ", e, name)
return to_crn_fallback(name)
def fixname(n):
return re.sub(r'\s+',' ', n).strip()
def short_name_to_teacher_type_crn_sem(name):
load_schedules()
crn, sem = short_name_to_crn(name)
try:
if sem:
sem = sem.lower()
            if sem[0:2] == 'wi':
                # winter codes are folded into the spring schedule files
                sem = 'sp' + sem[2:]
for course in schedules[sem]:
if course['crn'] == crn:
return fixname(course['teacher']), course['type'], crn, sem
except Exception as e:
return None, None, None, None
return None, None, None, None
pocrs = {}
def load_pocrs():
global pocrs
if not pocrs:
with open('cache/pocr_passed.csv') as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader)
for row in csvreader:
pocrs[row[0] + " " + row[1]] = row[2]
return pocrs
def lookup_pocr(teacher,course,sem):
p = load_pocrs()
pcode = teacher + " " + course
if pcode in p:
sem_passed = sem_code_to_num(p[pcode])
sem_test = sem_code_to_num(sem)
if sem_passed < sem_test:
return True
return False
def nametest():
with open(all_courses_file) as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader)
for row in csvreader:
print(row[0], "-", short_name_to_teacher_type_crn_sem(row[0]))
def above_70(li,maximum):
cutoff = 0.7 * maximum
above = list(filter(lambda x: x >= cutoff, li))
return round(len(above)/len(li), 3)
# v1, does a row of averages for each course
def process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code):
fxns = [mean, median, stdev, min, max, len]
c_id = block[0][0]
sem = block[0][1]
course_code = block[0][2]
cur_scores = [num(x[6]) for x in block]
final_scores = [num(x[7]) for x in block]
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
if not teacher:
return
tch_code = teacher_to_code[teacher]
crs_code = course_to_code[course_code]
if len(final_scores) < 2:
return
try:
(cur_mean, cur_median, cur_stdev, cur_min, cur_max, cur_count) = [round(f(cur_scores)) for f in fxns]
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
cur_pct_passed = above_70(cur_scores, cur_max)
final_pct_passed = above_70(final_scores, final_max)
if final_max == 0: return
scaled_final_scores = [ x / final_max for x in final_scores]
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
good_code = ilearn_name_to_course_code(course_code)
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
output.writerow( [crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, scl_min, scl_max, final_count] )
out_c.writerow([crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, final_count])
except Exception as e:
print("Exception:", e)
# v2, one line per student/course
def process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code):
fxns = [mean, median, stdev, min, max, len]
c_id = block[0][0]
sem = block[0][1]
course_code = block[0][2]
cur_scores = [num(x[6]) for x in block]
final_scores = [num(x[7]) for x in block]
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
if not teacher:
return
tch_code = teacher_to_code[teacher]
crs_code = course_to_code[course_code]
if len(final_scores) < 2:
return
try:
# "course_code course pocr_status orientation_status teacher_code mode student_id scaled_score"
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
final_pct_passed = above_70(final_scores, final_max)
if final_max == 0: return
scaled_final_scores = [ x / final_max for x in final_scores]
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
good_code = ilearn_name_to_course_code(course_code)
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
o = load_orientations()
for row in block:
student_id = row[3]
orientation = o[student_id] if student_id in o else 0
scaled_score = round(num(row[7]) / final_max, 2)
out_f.writerow([crs_code, good_code, pocr, orientation, tch_code, mode, student_id, scaled_score])
print(course_code)
except Exception as e:
print("Exception:", e)
def process_grades():
# first loop to get all names
courses_labeled = {}
teacher_to_code = {}
code_to_teacher = {}
course_to_code = {}
code_to_course = {}
index = 1001
crs_index = 4001
with open(all_grades_file, newline="") as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader)
for row in csvreader:
crn_sem = row[0] + '_' + row[1]
if not crn_sem in courses_labeled:
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(row[2])
courses_labeled[crn_sem] = teacher
if not row[2] in course_to_code:
course_to_code[row[2]] = crs_index
code_to_course[crs_index] = row[2]
crs_index += 1
if teacher:
if not teacher in teacher_to_code:
teacher_to_code[teacher] = index
code_to_teacher[index] = teacher
index += 1
codecs.open('cache/teacher_lookup_codes.json','w','utf-8').write( json.dumps( [teacher_to_code, code_to_teacher], indent=2) )
codecs.open('cache/course_lookup_codes.json','w','utf-8').write( json.dumps( [course_to_code, code_to_course], indent=2) )
out_fullrows = codecs.open(all_courses_file3,'w','utf-8')
out_f = csv.writer(out_fullrows)
out_f.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
out_compact = codecs.open(all_courses_file2,'w','utf-8')
out_c = csv.writer(out_compact)
out_c.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev count".split(" "))
with open(all_courses_file, "w", newline="") as output_f:
output = csv.writer(output_f)
output.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev scl_min scl_max count".split(" "))
with open(all_grades_file, newline="") as csvfile:
csvreader = csv.reader(csvfile)
block = []
current_index = None
next(csvreader)
for row in csvreader:
index = row[0]
if index != current_index:
if block:
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
block = []
current_index = index
block.append(row)
if block:
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
def reorganize_grades_student():
with open(all_courses_file3, newline="") as csvfile:
csvreader = csv.reader(csvfile)
bystudent = defaultdict(list)
next(csvreader)
for row in csvreader:
st = row[6]
bystudent[st].append(row)
students = sorted(bystudent.keys())
with open(all_courses_file4, "w", newline="") as output_f:
with open(all_courses_file5, "w", newline="") as output_s:
with open(student_courses_scores,'w') as output_scs:
output_s.write("student,courses\n")
output = csv.writer(output_f)
output.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
# student id 0 has no courses
output.writerow([0,])
for st in students:
courses = [r[1] for r in bystudent[st]]
scores = [r[7] for r in bystudent[st]]
zipped = zip(courses,scores)
output_scs.write(st + ",")
for c,s in zipped:
output_scs.write(f"{c}|{s},")
output_scs.write("\n")
output_s.write(st + "," + " ".join(courses) + "\n")
for row in bystudent[st]:
output.writerow(row)
def all_course_names_setup():
cc = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
courses = {}
for C in cc.values():
name = C['dept'] + C['number']
#print(name)
courses[ name ] = C
#co = codecs.open('cache/courses/names.json','w','utf-8')
#for c in sorted(courses.keys()):
# co.write(c + "\n")
cr = codecs.open('cache/courses/names.json','r','utf-8')
from_data = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
unknown = {}
for line in from_data:
parts = line.split(',')
stu_id = parts[0]
ea = parts[1:]
for C in ea:
each = C.split('|')
name = each[0]
if not name in courses:
unknown[name] = name
#data_courses[each[0]] += 1
for c in sorted(unknown.keys()):
print(c)
#co.write( json.dumps( {'unknown':unknown, 'coursenames':courses}, indent=2 ))
lookup = {}
names = {}
def shell2course(shell):
global lookup, names
if not lookup:
cr = json.loads(codecs.open('cache/courses/names.json','r','utf-8').read())
lookup = cr['unknown']
allcourses = cr['coursenames']
names = allcourses.keys()
if shell in names:
return shell
if shell in lookup:
c = lookup[shell]
if c in names:
return c
#print(f"Can't find course: {shell}")
return ""
def stu_record_line(line):
line = line.strip()
line = line.strip(',')
parts = line.split(',')
stu_id = parts[0]
courses = []
for C in parts[1:]:
courses.append(C.split('|'))
return stu_id, courses
def stu_record_to_vector(line, boolean=0):
id, courses = stu_record_line(line)
yesval = "true" if boolean else 1
noval = "false" if boolean else 0
template = json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
lookup = {}
for i,c in enumerate(template):
lookup[c] = i
vector = [noval for x in range(len(template))]
for C in courses:
goodname = shell2course(C[0])
if goodname:
vector[lookup[goodname]] = yesval # C[1] # score
return id,vector,courses
def grades_to_vectors(boolean=0, verbose=0):
grades = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
for L in grades:
id, vector, courses = stu_record_to_vector(L,boolean)
if verbose: print(id, vector)
yield id, vector, courses
def course_main_record():
return json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
def courses_to_vector_ordered(course_list):
# each course is (name, semester_order, score)
template = course_main_record()
lookup = {}
for i,c in enumerate(template):
lookup[c] = i
vector = ['0' for x in range(len(template))]
for course,order,score in course_list:
goodname = shell2course(course)
if goodname:
vector[lookup[goodname]] = str(order)
return vector
def courses_to_vector(course_list, boolean=1):
#print(course_list)
yesval = "true" if boolean else 1
noval = "false" if boolean else 0
template = course_main_record()
lookup = {}
for i,c in enumerate(template):
lookup[c] = i
vector = [noval for x in range(len(template))]
    for C in course_list:
        C = C.strip()
        #goodname = shell2course(C[0])
        #if goodname:
        #print(C)
        if C in lookup:  # guard against courses missing from the master record
            vector[lookup[C]] = yesval  # C[1] # score
#print(vector)
return vector
def course_vector_to_names(vector):
template = course_main_record()
names = []
for i,v in enumerate(vector):
if v:
names.append(template[i])
return names
def all_course_names():
ac = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
master_record = []
for C in ac.values():
if C['status'] == 'Draft':
continue
name = C['dept'] + C['number']
master_record.append(name)
master_record = set(master_record)
master_record = list(master_record)
master_record = sorted(master_record)
## Extract from all 'accomplished courses'...
if 0:
complete_list = {}
missing_names = {}
with open(student_courses_scores,'r') as input_f:
for L in input_f:
stu_id, courses = stu_record_line(L)
for C in courses:
real_name = shell2course(C[0])
if real_name:
complete_list[real_name] = 1
else:
missing_names[C[0]] = 1
master_record = sorted(complete_list.keys())
print(f"Found {len(master_record)} courses")
print(master_record)
print(f"Missing {len(missing_names)} courses")
print(missing_names)
mr = codecs.open('cache/courses/course_main_record.json','w','utf-8')
mr.write(json.dumps(master_record,indent=2))
from semesters import semester_list, canvas_label
from semesters import code as semester_order
from localcache import all_students_history
from datetime import datetime, timedelta
def semester_dates():
#print()
for c in canvas_label:
print(semester_list[c])
        length = 15
        if semester_list[c]['code'][0:2] == 'su':
            length = 5
        start_date = semester_list[c]['start']
        # Convert the date string to a datetime object
        date_object = datetime.strptime(start_date, '%m/%d/%y')
        start_fmt = date_object.strftime('%a %b %d, %Y')
        # Add the semester length (15 weeks, or 5 for summer) plus 5 days
        new_date = date_object + timedelta(weeks=length, days=5)
# Format the new date as a string
new_date_string = new_date.strftime('%m/%d/%y')
end_fmt = new_date.strftime('%a %b %d, %Y')
# Print the new date
print(f"start: {start_fmt}, end: {end_fmt}")
current_student = ""
current_student_block = []
current_student_info = {'first':'', 'last':''}
normalized_blocks = []
ignore_courses = "El,zACCT20,GASPAR".split(",")
seen_courses = []
def course_line_process(line):
global current_student, current_student_block, seen_courses, normalized_blocks, current_student_info
sem = line['term_name']
m1 = re.search(r'^(\d\d\d\d)\s(\w+)$', sem)
if not m1: # is NOT an academic semester, skip
return
uid = line['canvasid']
if uid != current_student:
if current_student_block:
current_student_block.append(current_student_info)
normalized_blocks.append(current_student_block)
current_student_block = []
current_student_info = {'first':semester_list[sem]['code'], 'last':''}
current_student = uid
#print(f"Student: {uid} ({line['user_name']})")
# line is a dict
current_student_info['last'] = semester_list[sem]['code']
year, season = m1.group(1), m1.group(2)
date_format = "%Y-%m-%d %H:%M:%S.%f"
create_dt = datetime.strptime(line['created'], date_format)
update_dt = datetime.strptime(line['updated'], date_format)
sem_start = datetime.strptime(semester_list[sem]['start'], '%m/%d/%y')
course = line['course_name']
c_parts = course.split(' ')
if c_parts[0] in ignore_courses or c_parts[0] in seen_courses:
return
classname = shell2course(c_parts[0])
if not classname:
# print empty dict entry for initial setup
# print(f" \"{c_parts[0]}\": \"\",")
seen_courses.append(c_parts[0])
else:
#
flow = line['workflow']
mark = '+'
if flow == "deleted": mark = '-'
# normal start & finish, give add date
add_day = sem_start - create_dt
add_day = add_day.days
sign = '-'
if add_day < 0:
add_day = -add_day
sign = '+'
#print(f" {mark} {classname} added T{sign}{add_day} {semester_list[sem]['code']}")
temp_usr_name = re.sub(r',','',line['user_name'])
current_student_block.append(f"{uid},{temp_usr_name},{classname},add,T{sign}{add_day},{semester_list[sem]['code']}")
if flow == "deleted":
# deleted, give delete date
del_day = sem_start - update_dt
del_day = del_day.days
sign = '-'
if del_day < 0:
del_day = -del_day
sign = '+'
#print(f" {mark} {classname} deleted T{sign}{del_day} {semester_list[sem]['code']}")
current_student_block.append(f"{uid},{temp_usr_name},{classname},del,T{sign}{del_day},{semester_list[sem]['code']}")
def normalize_course_histories():
global normalized_blocks, current_student_block, current_student_info
all_students_history(course_line_process, limit=99910000)
current_student_block.append(current_student_info)
normalized_blocks.append(current_student_block)
codecs.open('cache/normalized_student_add_drop.json','w','utf-8').write(json.dumps(normalized_blocks,indent=2))
# let's see if we can get grades...
grades_by_student_course = defaultdict(dict)
print("Doing grades...")
with codecs.open('cache/courses_student_scores.csv','r','utf-8') as gradesfile:
for s in gradesfile:
parts = s.split(',')
stu = int(parts[0])
#print(stu)
for c in parts[1:]:
try:
#print(c)
crs,gra = c.split('|')
grades_by_student_course[stu][crs] = gra
except Exception as e:
pass
# go through again
print("Second pass of grades and student history...")
student_history = codecs.open('cache/normalized_student_history.csv','w','utf-8')
student_history.write("studentid,studentname,course,action,when,grade,sem_name,first_sem,last_sem,tenure_length,sem_index\n")
semester_order.reverse()
for blk in normalized_blocks:
info = blk[-1]
first = semester_order.index(info['first']) + 1
last = semester_order.index(info['last']) + 1
length = last - first + 1
for course in blk[:-1]:
parts = course.split(',')
#print(parts)
sem = parts[5]
sem_index = semester_order.index(sem) - first + 2
stu = int(parts[0])
crs = parts[2]
grade = ""
if stu in grades_by_student_course:
if crs in grades_by_student_course[stu]:
grade = grades_by_student_course[stu][crs]
student_history.write(",".join([parts[0], parts[1], parts[2], parts[3], parts[4], grade, parts[5], str(first), str(last), str(length), str(sem_index), ]) + '\n')
# make "unified records" or one line per student
student_history_2 = codecs.open('cache/normalized_student_history2.csv','w','utf-8')
allcourse = course_main_record()
#print(allcourse)
template = ['studentid', 'studentname', 'tenure_length']
template.extend(allcourse)
#print(template)
student_history_2.write( ",".join(template) + "\n" )
for blk in normalized_blocks:
student_block = []
info = blk[-1]
first = semester_order.index(info['first']) + 1
last = semester_order.index(info['last']) + 1
length = last - first + 1
temp_course_holder = {}
temp_course_grade_holder = {}
for course in blk[:-1]:
parts = course.split(',')
#print(parts)
sem = parts[5]
sem_index = semester_order.index(sem) - first + 2
stu = int(parts[0])
crs = parts[2]
if parts[3] == 'add':
temp_course_holder[crs] = sem_index
elif parts[3] == 'del' and crs in temp_course_holder:
del temp_course_holder[crs]
# now the temp_course_holder has the courses and semesters
for crs,sem_index in temp_course_holder.items():
grade = ""
if stu in grades_by_student_course:
if crs in grades_by_student_course[stu]:
grade = grades_by_student_course[stu][crs]
this_record = (crs, sem_index, grade)
student_block.append(this_record)
student_vector = [ parts[0], parts[1], str(length) ]
student_vector.extend(courses_to_vector_ordered(student_block))
student_history_2.write(",".join(student_vector) + '\n')
#print(student_vector)
def cluster_student_histories():
    # WIP stub: loads the per-student score file; a clustering sketch follows below.
    infile = 'cache/courses_student_scores.csv'
    import pandas as pd
    import matplotlib.pyplot as plt
    from kneed import KneeLocator
    from sklearn.datasets import make_blobs
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    from sklearn.preprocessing import StandardScaler
    df = pd.read_csv(infile)
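# Hedged sketch for the stub above, assuming the 0/1 course vectors from
# grades_to_vectors() are a reasonable feature space (score-weighted vectors
# may cluster better). Picks k with the elbow method (KneeLocator) and
# reports a silhouette score. The function name is hypothetical.
def cluster_student_vectors_sketch(max_k=10):
    from kneed import KneeLocator
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    from sklearn.preprocessing import StandardScaler
    # One row per student: 1 where the student took the course, else 0.
    X = np.array([vector for _, vector, _ in grades_to_vectors(boolean=0)])
    X = StandardScaler().fit_transform(X)
    ks = list(range(2, max_k + 1))
    inertias = []
    for k in ks:
        inertias.append(KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_)
    elbow = KneeLocator(ks, inertias, curve="convex", direction="decreasing").elbow or ks[0]
    km = KMeans(n_clusters=elbow, n_init=10, random_state=0).fit(X)
    print(f"k={elbow}, silhouette={silhouette_score(X, km.labels_):.3f}")
    return km.labels_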
def dept(s):
parts = s.split(' ')
return parts[0]
def try_make_sched():
term = "fa23"
sched = requests.get(f"http://gavilan.cc/schedule/{term}_sched.json").json()
#print(json.dumps(sched,indent=2))
d = "CSIS"
courses = [ [x['code'], x['crn']] for x in sched if dept(x['code'])==d ]
teachers = { x['teacher'] for x in sched if dept(x['code'])==d }
print(courses)
print(teachers)
def sched_lookup_tables():
# Renumber the semesters
# sp16 su16 fa16 wi17 sp17 su17 fa17 wi18
#semesters = "sp18 su18 fa18 wi19 sp19 su19 fa19 wi20 sp20 su20 fa20 wi21 sp21 su21 fa21 wi22 sp22 su22 fa22 wi23 sp23 su23 fa23 wi24 sp24 su24 fa24 wi25 sp25 su25 fa25 wi26".split(" ")
sem_fourcode = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24 sp25 su25 fa25".split(" ")
int_numbers = [x for x in range(1,len(sem_fourcode)+1)]
fourcode_2_int = {semester: number for semester, number in zip(sem_fourcode, int_numbers)}
int_2_fourcode = {v: k for k, v in fourcode_2_int.items()}
sis_2_fourcode = {}
fourcode_2_sis = {}
yr = 2018
sems = ['30','50','70']
i = 0
semcodes = []
while yr < 2026:
for s in sems:
semcodes.append(f"{yr}{s}")
sis_2_fourcode[f"{yr}{s}"] = sem_fourcode[i]
fourcode_2_sis[sis_2_fourcode[f"{yr}{s}"]] = f"{yr}{s}"
#print(f"UPDATE schedule SET semsis={yr}{s} WHERE sem='{semesters[i]}';")
i += 1
yr += 1
return fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes
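# Quick round-trip check for the lookup tables above (a usage sketch; the
# function name is hypothetical).
def sched_lookup_roundtrip_sketch():
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    assert int_2_fourcode[fourcode_2_int['fa23']] == 'fa23'
    assert fourcode_2_sis[sis_2_fourcode['202370']] == '202370'
    print('fa23 ->', fourcode_2_int['fa23'], '| 202370 ->', sis_2_fourcode['202370'])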
def section_stats_bymode():
data = query_multiple("SELECT code, semsis, COUNT(id) AS sections, sum(act) filter (WHERE type='in-person') AS inperson, sum(act) filter (WHERE type='online') AS online, sum(act) filter (WHERE type='hybrid') AS hybrid, sum(act) filter (WHERE type='online live') AS onlinelive FROM schedule GROUP BY code, semsis ORDER BY code, semsis;", 'cache/canvas_data/data20231012.db')
import pandas as pd
df = pd.DataFrame(data)
df.fillna(0,inplace=True)
for L in 'sections,inperson,online,hybrid,onlinelive'.split(','):
df[L] = df[L].astype(int)
print(df)
df.to_csv('cache/section_stats_bymode.csv')
return df
def section_stats():
    # for each course (e.g. ENG1A), how many students are enrolled across all sections?
    # (and break down by mode, time, location, etc.)
#
# for each course, how many are first semester gav students?
#
data = query_multiple("SELECT * FROM schedule ORDER BY code,id", 'cache/canvas_data/data20231012.db')
fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
# Assuming your data is in a list of dictionaries called data
df = pd.DataFrame(data)
# Drop the specified columns
df = df.drop(columns=['id', 'crn', 'units', 'teacher', 'start', 'end', 'loc', 'cap'])
codecs.open('cache/sem_mapping.json','w','utf-8').write(json.dumps(fourcode_2_int,indent=2))
df['sem'] = df['sem'].map(fourcode_2_int)
df.set_index('sem', inplace=True)
return df
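# Usage sketch answering the first question in the comments of section_stats():
# total actual enrollment per course across all sections and semesters
# (assumes the 'act' column carried through from the schedule table, as the
# smoothing models below do). The function name is hypothetical.
def enrollment_per_course_sketch():
    df = section_stats()
    return df.groupby('code')['act'].sum().sort_values(ascending=False)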
def simple_exp_smoothing_section_model():
sout = codecs.open('cache/section_predictions.txt','w','utf-8')
from statsmodels.tsa.api import SimpleExpSmoothing
warnings.filterwarnings("ignore")
periods = 3
start = 19
df = section_stats()
print(df)
df = df.sort_index()
predictions = {}
for course_code in df['code'].unique():
try:
print(course_code)
sout.write(course_code + "\n")
this_set = df[df['code'] == course_code]['act']
this_set = this_set.groupby('sem').sum()
#this_set.fillna(method='ffill', inplace=True)
#this_set.fillna(0, inplace=True)
# Create a new index with all required semesters
new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
# Reindex the DataFrame and fill missing values with 0
this_set = this_set.reindex(new_index, fill_value=0)
print(this_set.to_string())
sout.write(this_set.to_string() + "\n")
model = SimpleExpSmoothing(this_set)
            fit = model.fit(smoothing_level=0.2)  # start with a smoothing level of 0.2
            # Adjust the smoothing level depending on whether the data has high or low variability
#prediction = fit.forecast(start=32,end=34) # predict attendance for the next 3 semesters
prediction = fit.predict(start=start,end=start+4)
print(prediction)
sout.write(str(prediction) + "\n")
sout.flush()
predictions[course_code] = prediction
except Exception as e:
print(f"Model creation failed for {course_code} due to {str(e)}")
sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
"""
model = ARIMA(this_set, order=(1,1,1)) #ARIMA params (p, d, q)
model_fit = model.fit()
forecast_result = model_fit.forecast(steps=periods)
if forecast_result:
predictions[course_code] = forecast_result[0]
else:
print(f"No prediction for {course_code}. Skipping...")"""
# statistics - use an exponential smoothing model to predict the next 3 semesters of enrollment.
# Doesn't really seem to capture the patterns.
def exp_smoothing_section_model():
sout = codecs.open('cache/section_predictions.txt','w','utf-8')
from statsmodels.tsa.api import ExponentialSmoothing
warnings.filterwarnings("ignore")
periods = 3
start = 19
fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
df = section_stats()
print(df)
df = df.sort_index()
predictions = {}
for course_code in df['code'].unique():
try:
print(course_code)
#sout.write(course_code + "\n")
this_set = df[df['code'] == course_code]['act']
this_set = this_set.groupby('sem').sum()
#this_set.fillna(method='ffill', inplace=True)
#this_set.fillna(0, inplace=True)
# Create a new index with all required semesters
new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
# Reindex the DataFrame and fill missing values with 0
this_set = this_set.reindex(new_index, fill_value=0)
print(this_set.to_string())
for i,v in this_set.items():
sout.write(f"{course_code},{int_2_fourcode[i]},{v}\n")
model = ExponentialSmoothing(this_set, seasonal_periods=4, trend='add', seasonal='add')
fit = model.fit()
prediction = fit.predict(start=start,end=start+4)
print(prediction)
for i,v in prediction.items():
v = int(v)
if v<0: v=0
sout.write(f"{course_code},{int_2_fourcode[i]}, {v}\n")
sout.flush()
predictions[course_code] = prediction
except Exception as e:
print(f"Model creation failed for {course_code} due to {str(e)}")
#sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
def student_by_semester():
query = """
SELECT u.name, u.canvasid, s.code, s.semsis FROM users u
JOIN enrollment e ON u.id = e.user_id
JOIN courses c ON c.id = e.course_id
JOIN terms t ON c.termid = t.id
JOIN schedule s ON c.schedule = s.id
WHERE e.type='StudentEnrollment' AND e.workflow='active'
ORDER BY u.sortablename, s.semsis;
"""
df = pd.DataFrame(query_multiple(query, 'cache/canvas_data/data20231012.db'))
# Apply groupby and aggregate the courses in each semester in a comma-separated string
df['courses'] = df.groupby(['name','canvasid','semsis'])['code'].transform(lambda x : ' / '.join(x))
# Removing duplicates
df = df[['name','canvasid','semsis','courses']].drop_duplicates()
# Create pivot table
df_pivot = df.pivot_table(values='courses', index=['name','canvasid'], columns='semsis', aggfunc='first').reset_index()
# Adding prefix to new columns names to recognize them
df_pivot.columns = [str(col) + '_sem' if isinstance(col, int) else col for col in df_pivot.columns]
df_pivot.to_csv('cache/student_by_semester.csv')
def sections_grouped_by_year_mode():
df = section_stats_bymode()
# list of unique courses
df_all_courses = df['code'].unique()
# list of unique semesters
df_all_semesters = df['semsis'].unique()
df_all_semesters.sort()
raw_data = {}
    for _, line in df.iterrows():
        print(line['semsis'])
        sis = str(line['semsis'])
        year = sis[0:4]
        raw_data[f"{line['code']}{year}"] = [line['inperson'], line['online'], line['hybrid'], line['onlinelive']]
print(raw_data)
return
for course in df_all_courses:
c = str(course)
template = {'code':[c,c,c], 'semsis':[], 'inperson':[], 'online':[], 'hybrid':[], 'onlinelive':[]}
# group semesters in to groups of 3 by year
for i in df_all_semesters:
j = str(i)
year = j[0:4]
print(f"{i} ({year})")
# for each course, for each group of 3 semesters, fill in values, using 0 if necessary
# ...
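# Hedged sketch of the grouping outlined in the trailing comments above: for
# one course, bucket per-semester mode counts by SIS year, summing the (up to
# three) semesters in each year. Column names follow section_stats_bymode();
# the function name and output shape are assumptions.
def course_modes_by_year_sketch(df, course_code):
    modes = ['inperson', 'online', 'hybrid', 'onlinelive']
    by_year = defaultdict(lambda: [0, 0, 0, 0])
    for _, row in df[df['code'] == course_code].iterrows():
        year = str(row['semsis'])[0:4]
        for j, m in enumerate(modes):
            by_year[year][j] += int(row[m])
    return dict(by_year)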
def lstm_model_sections():
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Preprocessing
# Normalize inputs for better performance
df = section_stats_bymode()
print(df)
scaler = MinMaxScaler(feature_range=(0, 1))
dataset_scaled = scaler.fit_transform(df.drop(['code', 'semsis'], axis=1))
print("scaled:")
print(df)
# Split features and targets (Assuming you want to predict 'online' enrollments)
X = dataset_scaled[:, 1:]
Y = dataset_scaled[:,0:1]
# Train / Test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Reshape input to be [samples, time steps, features] which is required for LSTM
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
print("x_train shape:", x_train.shape)
print(x_train)
print("\n\nTraining...\n\n")
# LSTM architecture
model = Sequential()
model.add(LSTM(50, input_shape=(X.shape[1], 1))) # 50 LSTM blocks
    model.add(Dense(1))  # single output: the target column selected above
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=5, batch_size=1) # Training the model
# Prediction
scaler_predict = MinMaxScaler()
scaler_predict.fit_transform(df[['online']])
trainPredict = model.predict(x_train)
testPredict = model.predict(x_test)
# Invert predictions (Due to normalization)
trainPredict = scaler_predict.inverse_transform(trainPredict)
testPredict = scaler_predict.inverse_transform(testPredict)
# Now you have your future prediction in testPredict.
print("Predictions:")
print(testPredict)
np.savetxt('cache/section_predictions_lstm.txt',testPredict, fmt='%f')
    # I'm lost here...
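# One way out of the inverse-transform tangle above (a sketch, assuming the
# intent is to predict the 'online' column): fit separate scalers for the
# features and the target, so the target scaler can cleanly invert the
# model's single-column predictions. The function name is hypothetical.
def scale_features_target_sketch(df):
    from sklearn.preprocessing import MinMaxScaler
    x_scaler = MinMaxScaler(feature_range=(0, 1))
    y_scaler = MinMaxScaler(feature_range=(0, 1))
    X = x_scaler.fit_transform(df.drop(['code', 'semsis', 'online'], axis=1))
    Y = y_scaler.fit_transform(df[['online']])
    return X, Y, y_scaler  # later: y_scaler.inverse_transform(model.predict(x_test))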
def visualize_course_modes_multi_semester():
import plotly.express as px
from plotly.subplots import make_subplots
seasons = {'sp':'30','su':'50','fa':'70'}
semcodes = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24".split(" ")
# sems = {'sp23':'202330','su23':'202350','fa23':'202370'}
sems = { x:'20' + x[2:] + seasons[x[:2]] for x in semcodes }
sem_dfs = []
sem_dfs_depts = []
for s in sems.keys():
sched = requests.get(f"http://gavilan.cc/schedule/{s}_sched_expanded.json").json()
for crs in sched:
if 'extra' in crs: del crs['extra']
crs['dept'] = crs['code'].split(' ')[0]
df = pd.DataFrame(sched)
df_depts = df.copy()
df_depts = df_depts.drop(columns=['crn','sec','code','cmp','name','days','time','rem','wl_cap','wl_act','wl_rem','teacher','date','loc','ztc','time_start','time_end','start','end','doy'])
df = df.drop(columns=['crn','sec','code','cmp','name','days','time','rem','wl_cap','wl_act','wl_rem','teacher','date','loc','ztc','time_start','time_end','start','end','doy'])
dept_counts = grouped_by_dept = df_depts.groupby(['dept','type']).size().reset_index(name='count')
grouped_by_mode = df['type'].value_counts().reset_index()
grouped_by_dept["semester"] = sems[s]
grouped_by_mode["semester"] = sems[s]
#print(dept_counts)
sem_dfs.append(grouped_by_mode)
sem_dfs_depts.append(grouped_by_dept)
#grouped_json = grouped_by_dept.to_json(orient='records')
#j = json.loads(grouped_json)
#print(json.dumps(j,indent=2))
#grouped_by_dept.columns = ['Department', 'Count'] # rename the column names appropriately
#fig = px.bar(grouped_by_dept, x='Department', y='Count', title='Section Counts by Department')
#fig.write_html(f"cache/output_{s}.html")
combined_data = pd.concat(sem_dfs, axis=0)
combined_data = combined_data.rename(columns={'type':'count','index':'type'})
combined_data.reset_index(drop=True,inplace=True)
pivoted_data = combined_data.pivot(index='semester', columns='type', values='count')
pivoted_data.reset_index(inplace=True)
fig = px.bar(pivoted_data, x='semester',y=['hybrid', 'in-person', 'online', 'online live'], barmode='stack',
title='Course Delivery by Semester',
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
fig.write_html(f"cache/sections_by_deliverymode.html")
combined_data_depts = pd.concat(sem_dfs_depts, axis=0)
combined_data_depts.reset_index(drop=True,inplace=True)
#print(combined_data_depts)
combined_data_depts.to_csv('cache/section_delivery_by_dept.csv')
'''pivoted_data_depts = combined_data_depts.pivot(index='semester', columns='type', values='count')
pivoted_data_depts.reset_index(inplace=True)
fig = px.bar(pivoted_data_depts, x='semester',y=['hybrid', 'in-person', 'online', 'online live'], barmode='stack',
title='Course Delivery by Semester',
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
fig.write_html(f"cache/sections_depts_by_deliverymode.html")'''
unique_depts = combined_data_depts['dept'].unique()
fig = make_subplots(rows=len(unique_depts), cols=1,
subplot_titles=unique_depts,
)
for i, dept in enumerate(unique_depts, start=1):
#if i>1: break
# Filter the dataframe for the current department
dept_data = combined_data_depts[combined_data_depts['dept'] == dept]
# Pivot the data frame
pivoted_dept_data = dept_data.pivot(index='semester', columns='type', values='count').reset_index()
pivoted_dept_data.fillna(0,inplace=True)
print(pivoted_dept_data)
# Plot the data
columns_to_plot = ['hybrid', 'in-person', 'online', 'online live']
valid_columns = [col for col in columns_to_plot if col in pivoted_dept_data.columns]
fig_sub = px.bar(pivoted_dept_data, x='semester', y=valid_columns, barmode='stack',
#title=f'Course Delivery by Semester for {dept}',
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
fig.add_traces(fig_sub['data'], rows=[i]*len(fig_sub['data']), cols=[1]*len(fig_sub['data']))
fig.update_layout(height=70*len(fig['data']), width=1100, showlegend=False)
fig.write_html(f"cache/sections_depts_by_deliverymode.html")
# given a list of classes, report back about the student on one row of info
def student_history_analysis(sh):
from functools import reduce
semesters_set = set()
num_sems = 0
num_course = len(sh)
num_units = 0
units_online = 0
units_inperson = 0
units_hybrid = 0
units_ol = 0
fa_23_units = 0
fa_23_online_units = 0
fa23_courses = 0
fa23_onlinecourses = 0
#un_list = [ float(x['units'].split('-')[0].split('/')[0]) for x in sh ]
#num_units = reduce(lambda x,y: x+y, un_list)
for section in sh:
semesters_set.add(section['sis'])
units = float(section['units'].split('-')[0].split('/')[0])
num_units += units
if section['type'] == 'in-person': units_inperson += units
if section['type'] == 'online': units_online += units
if section['type'] == 'hybrid': units_hybrid += units
if section['type'] == 'online live': units_ol += units
if section['sis'] == '202370':
fa_23_units += units
fa23_courses += 1
if not section['type'] == 'in-person':
fa_23_online_units += units
fa23_onlinecourses += 1
num_sems = len(semesters_set)
if num_units == 0:
pct_online = 0
else:
pct_online = round(100 * (units_online+units_hybrid+units_ol) / num_units, 1)
if fa_23_units == 0:
fa_23_pct_online = 0
else:
fa_23_pct_online = round(100 * (fa_23_online_units) / fa_23_units, 1)
if fa23_courses == 0:
fa23_pct_course_online = 0
else:
fa23_pct_course_online = round(100 * (fa23_onlinecourses) / fa23_courses, 1)
    summary = [num_units, num_course, f"\"{sh[0]['sortablename']}\",{sh[0]['canvasid']},{num_sems},{num_course},{num_units},{units_online},{units_inperson},{units_hybrid},{units_ol},{pct_online},{fa_23_units},{fa_23_online_units},{fa_23_pct_online},{fa23_courses},{fa23_onlinecourses},{fa23_pct_course_online}"]
return summary
def report_student_stats():
from localcache import users_with_history, students_current_semester
from itertools import groupby
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
u = users_with_history()
this_sem = [x['canvasid'] for x in students_current_semester()]
df = pd.DataFrame(u)
filtered_df = df[df['canvasid'].isin(this_sem)]
filtered_df.to_csv('cache/student_history_current_students.csv',index=False)
oo = codecs.open('cache/student_units.txt','w','utf-8')
oo.write("name,id,num_sems,num_course,num_units,units_online,units_inperson,units_hybrid,units_ol,percent_online,fa23_units,fa23_onlineunits,fa23_pct_online,fa23_num_courses,fa23_num_onlinecourses,fa23_percent_online_course\n")
# Now group by that key
def kk(x): return x['canvasid']
grouped_dict = {key:list(group) for key, group in groupby(u, kk)}
shorter = []
percentages = []
for k,g in grouped_dict.items():
if k in this_sem:
h = student_history_analysis(g)
#oo.write(json.dumps(h[2],indent=2)+ "\n")
oo.write( str(h[2]) + "\n")
shorter.append(h)
p = h[2].split(',')[-1]
percentages.append(float(p))
else:
print(f"Skipping {k}")
#print(this_sem)
#oo.write('units,courses\n')
#shorter.sort(key=lambda x: x[0], reverse=True)
#for s in shorter:
# print(s[2])
# #oo.write(f"{s[0]},{s[1]}\n")
# #print('\n\n')
# Create a histogram
fig = go.Figure(data=[go.Histogram(x=percentages, xbins=dict(start=0,end=101, size=10))])
# Save the figure in an HTML file
pio.write_html(fig, 'cache/student_pct_onlinecourse.html')
if __name__ == "__main__":
options = { 1: ['get all historical grades from ilearn',get_all] ,
2: ['process grades csv file',process_grades] ,
3: ['reorganize full grades file by student', reorganize_grades_student],
4: ['test shortname parse',nametest] ,
5: ['test sem codes',codetest] ,
6: ['get student data from orientations', get_student_orientations],
7: ['manage course master list', all_course_names],
8: ['grades to vectors', grades_to_vectors],
9: ['semester startdates list', semester_dates],
10: ['normalize course histories', normalize_course_histories],
11: ['cluster student histories', cluster_student_histories],
12: ['try to make a schedule', try_make_sched],
13: ['ES model section predict attendance', exp_smoothing_section_model],
14: ['section stats by mode', section_stats_bymode],
15: ['student courses by semester', student_by_semester],
16: ['LSTM model sections', lstm_model_sections],
17: ['rearrange section data to yearly form', sections_grouped_by_year_mode],
30: ['visualize course modes multi semester', visualize_course_modes_multi_semester],
31: ['Report on student stats', report_student_stats],
}
print ('')
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()