# statistics
"""
## Investigate: success rates (grades) of students in:
- online courses (overall)
- sync, async, and online live courses
- teachers/courses that have passed POCR (are all of these async?)
- teachers who have done more than the minimum training in online teaching
- in-person classes, if grades are available

## Data collection
- Choose how many semesters (10?)
- Script 1 - given a CRN and semester, download all grades
  - Check that grades were used and make sense
  - Compute mean, % > 70, median, etc.
- Anonymization steps
  - replace teacher names w/ id numbers
  - replace student names w/ id numbers
  - replace course names w/ course codes
- Script 2 - given all semester schedules, generate lists of:
  - CRNs which are online, online live, hybrid, in-person, or excluded
  - CRNs in which the teacher and course have passed POCR (and the semester is after their pass date)
  - CRNs in which the teacher passed POCR for a different course (and the semester is after their pass date)
  - CRNs to exclude, for example SP20 because of covid; possibly SU20 and FA20
  - CRNs which are POCR approved
  - CRNs in which the teacher has done more than the minimum training in online teaching
  - Student ids which have participated in the online orientation over a certain threshold
- Next steps: generate the cross-reference for what categories teachers are in, and integrate it into the main data file.
- Next steps (June/July 2023)
  - add campus, time of day, and sem_order (which semester in their college career they took it) columns
  - organize rows by student and develop a way to categorize students: by course set and/or score set (clustering: k-means, random forest, etc.)
- Goals
  - display and summarize clusters of students on a dashboard
  - ongoing categorization (implying course recommendations and interventions) based on it

## Hypothesis Testing
- TBD; a starting two-sample t-test sketch (hypothesis_test_pocr_sketch) appears after the semester-code helpers below.
"""
import codecs, os, warnings, itertools
import json, csv, requests, sys, re
import numpy as np
import pandas as pd
from statistics import mean, median, stdev
from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments
from localcache import query_multiple
from collections import defaultdict

all_grades_file = "cache/grades_all.csv"
all_courses_file = "cache/course_grades_all.csv"
all_courses_file2 = "cache/course_grades_compact.csv"
all_courses_file3 = "cache/course_grades_full.csv"
all_courses_file4 = "cache/course_grades_full_bystudent.csv"
all_courses_file5 = "cache/courses_passed_bystudent.csv"
student_courses_scores = "cache/courses_student_scores.csv"
student_orientation_participation = 'cache/participation_orientation_courses.json'

def num(s):
    """Parse a numeric string from the grades CSV; '' counts as 0."""
    if s == '':
        return 0
    s = re.sub(r'\.0$', '', s)  # strip only a trailing ".0" so values like '10.05' survive
    try:
        return int(s)
    except ValueError:
        return float(s)

def sem_num_to_code(sem_num):
    """'202370' -> 'fa23'"""
    p = re.search(r'^(\d\d\d\d)(\d\d)$', sem_num)
    if p:
        yr = p.group(1)[2:4]
        sem = p.group(2)
        lookup = {'10': 'wi', '30': 'sp', '50': 'su', '70': 'fa'}
        return f"{lookup[sem]}{yr}"
    return ""

def sem_code_to_num(sem_code):
    """'fa23' -> '202370'"""
    p = re.search(r'^([a-z]{2})(\d\d)$', sem_code)
    if p:
        s = p.group(1)
        y = p.group(2)
        lookup = {'wi': '10', 'sp': '30', 'su': '50', 'fa': '70'}
        return f"20{y}{lookup[s]}"
    return ""

def codetest():
    sems = ('202330 202310 202270 202250 202230 202210 202170 202150 202130 '
            '202070 202050 202030 202010 201970 201950 201930 201910 201870 '
            '201850 201830').split(' ')
    codes = 'fa21 wi22 sp23 su23 fa23 wi24'.split(' ')
    for s in sems:
        print("{}: {}".format(s, sem_num_to_code(s)))
    for c in codes:
        print("{}: {}".format(c, sem_code_to_num(c)))
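# The "Hypothesis Testing" section of the plan above is still empty. As a
# starting point, here is a minimal sketch (not wired into the menu) of a
# two-sample t-test comparing section-level mean scaled scores for POCR vs.
# non-POCR sections, assuming the column layout process_grades() writes to
# course_grades_compact.csv. Requires scipy, which is not a module-level import.
def hypothesis_test_pocr_sketch():
    from scipy import stats
    df = pd.read_csv(all_courses_file2)
    pocr = df[df['pocr_status'] == 1]['scl_mean']
    other = df[df['pocr_status'] == 0]['scl_mean']
    # Welch's t-test: does not assume equal variances between the two groups
    t, p = stats.ttest_ind(pocr, other, equal_var=False)
    print(f"POCR n={len(pocr)} mean={pocr.mean():.3f}; "
          f"other n={len(other)} mean={other.mean():.3f}; t={t:.3f} p={p:.4f}")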
def get_all():
    terms = '178 177 176 175 174 173 172 171 168 65 64 62 63 61 60 25 26 23 22 21'.split(' ')
    sems = ('202330 202310 202270 202250 202230 202210 202170 202150 202130 '
            '202070 202050 202030 202010 201970 201950 201930 201910 201870 '
            '201850 201830').split(' ')
    # save grades to a CSV file
    with open(all_grades_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["crn", "sem", "coursecode", "s_can_id", "g", "name", "current", "final"])
        for (term, sem) in zip(terms, sems):
            print(term, sem, "\n")
            courses = getCoursesInTerm(term, get_fresh=0, show=0, active=1)
            for c in courses:
                print(c['name'])
                c_code = c['course_code']
                grades(writer, sem, c['id'], c_code)
            csvfile.flush()

def grades(writer, sem, COURSE_ID, course_code):
    params = {"include[]": ["enrollments", "current_grading_period_scores"]}
    users = fetch(url + f"/api/v1/courses/{COURSE_ID}/users", 0, params)
    for student in users:
        try:
            id = student["id"]
            name = student["name"]
            g = student["login_id"]
            print("\t", name)
            if student['enrollments'][0]['type'] == 'StudentEnrollment':
                grade = student["enrollments"][0]["grades"]["final_score"]
                current = student["enrollments"][0]["grades"]["current_score"]
                writer.writerow([COURSE_ID, sem, course_code, id, g, name, current, grade])
        except Exception as e:
            print("Exception:", e)

def get_student_orientations():
    # orientation course shells, with enrollment counts noted at time of writing
    courses = {'iLearn Student Orientation 2022': '9768',               # 8170 students
               'Kickstart Online Orientation - Transfer': '36',         # 6149
               'Kickstart Online Orientation - New to College': '35',   # 5392
               'LIB732 SP18': '3295',                                   # 2193
               'LIB732 FA17': '2037',                                   # 1868
               'LIB732 SP17': '69',                                     # 1645
               'Kickstart Online Orientation - Returning': '37',        # 1463
               'iLearn Student Orientation 2023': '15924',              # 1292
               'LIB732 SU17': '1439'                                    # 1281
               }
    views_bycourse = {}
    all_student_ids = set()
    # get pageviews of each orientation course
    for c, i in courses.items():
        print(c)
        cache_file_name = f'cache/participation_course_{i}.json'
        student_ids = [x[1] for x in get_course_enrollments(i)]
        all_student_ids.update(student_ids)
        if os.path.exists(cache_file_name):
            pv = json.loads(codecs.open(cache_file_name, 'r', 'utf-8').read())
        else:
            pv = get_student_page_views(i, student_ids)
            codecs.open(cache_file_name, 'w', 'utf-8').write(json.dumps(pv, indent=2))
        views_bycourse[i] = pv
    # add up pageviews for each student
    views_bystudent = {}
    for student_id in all_student_ids:
        views_bystudent[student_id] = sum([views_bycourse[i].get(student_id, 0) for i in courses.values()])
    codecs.open(student_orientation_participation, 'w', 'utf-8').write(json.dumps(views_bystudent, indent=2))

def get_student_page_views(course_id, student_ids):
    page_views = {}
    verbose = 0
    for student_id in student_ids:
        a = f'/api/v1/courses/{course_id}/analytics/users/{student_id}/activity'
        response = fetch(url + a, verbose)
        page_views[student_id] = sum(response.get('page_views', {}).values())
        if verbose:
            print(page_views)
    return page_views

schedules = {}
orientations = {}

def load_schedules():
    global schedules
    if not schedules:
        for f in os.listdir('cache/schedule'):
            m = re.search(r'(\w\w\d\d)_sched_expanded\.json', f)
            if m:
                sem = m.group(1)
                schedules[sem] = json.loads(codecs.open('cache/schedule/' + f, 'r', 'utf-8').read())

def load_orientations():
    global orientations
    if not orientations:
        orientations = json.loads(codecs.open(student_orientation_participation, 'r', 'utf-8').read())
    return orientations
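# The plan above calls for "student ids which have participated in the online
# orientation over a certain threshold." A minimal sketch of that filter over
# the pageview-count JSON written by get_student_orientations(); the default
# threshold is an illustrative placeholder, not a calibrated cutoff.
def students_above_orientation_threshold(threshold=10):
    views = load_orientations()
    return {sid for sid, count in views.items() if count >= threshold}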
def to_crn_fallback(name):
    """Last-resort parse: pull any 5-digit CRN and any semester code out of a shell name."""
    name = name.lower()
    try:
        m1 = re.search(r'(\d\d\d\d\d)', name)
        if m1:
            crn = m1.group(1)
        else:
            return None, None
        m2 = re.search(r'([wispufa][wispufa]\d\d)', name)
        if m2:
            sem = m2.group(1)
        else:
            return None, None
        return crn, sem
    except Exception:
        return None, None

def ilearn_name_to_course_code(iname):
    parts = iname.split(' ')
    code = parts[0]
    return code

def short_name_to_crn(name):
    """Parse an iLearn short name like 'CODE sem CRN' into (crn, sem)."""
    try:
        parts = name.split(' ')
        code = parts[0]
        sem = parts[1]
        crn = parts[2]
        m_sem = re.search(r'^(\w\w\d\d)$', sem)
        if not m_sem:
            return to_crn_fallback(name)
        m = re.search(r'^(\d\d\d\d\d)$', crn)
        if m:
            return crn, sem
        else:
            # cross-listed shells use 'crn1/crn2'; take the first CRN
            crn_parts = crn.split('/')
            m = re.search(r'^(\d\d\d\d\d)$', crn_parts[0])
            if m:
                return crn_parts[0], sem
        return to_crn_fallback(name)
    except Exception:
        return to_crn_fallback(name)

def fixname(n):
    return re.sub(r'\s+', ' ', n).strip()

def short_name_to_teacher_type_crn_sem(name):
    load_schedules()
    crn, sem = short_name_to_crn(name)
    try:
        if sem:
            sem = sem.lower()
            if sem[0:2] == 'wi':
                # winter sections are listed in the spring schedule
                sem = 'sp' + sem[2:]
            for course in schedules[sem]:
                if course['crn'] == crn:
                    return fixname(course['teacher']), course['type'], crn, sem
    except Exception:
        return None, None, None, None
    return None, None, None, None

pocrs = {}

def load_pocrs():
    global pocrs
    if not pocrs:
        with open('cache/pocr_passed.csv') as csvfile:
            csvreader = csv.reader(csvfile)
            next(csvreader)  # skip header
            for row in csvreader:
                pocrs[row[0] + " " + row[1]] = row[2]
    return pocrs

def lookup_pocr(teacher, course, sem):
    """True if this teacher passed POCR for this course before the given semester."""
    p = load_pocrs()
    pcode = teacher + " " + course
    if pcode in p:
        sem_passed = sem_code_to_num(p[pcode])
        sem_test = sem_code_to_num(sem)
        if sem_passed < sem_test:
            return True
    return False

def nametest():
    with open(all_courses_file) as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header once, before the loop
        for row in csvreader:
            print(row[0], "-", short_name_to_teacher_type_crn_sem(row[0]))

def above_70(li, maximum):
    """Fraction of scores at or above 70% of the maximum score."""
    cutoff = 0.7 * maximum
    above = list(filter(lambda x: x >= cutoff, li))
    return round(len(above) / len(li), 3)
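# A few worked examples of the short-name parsing above. The shell names are
# invented for illustration, but follow the patterns the parser handles.
def crn_parse_demo():
    for name in ['ENG1A fa23 30123',                 # standard: code, semester, CRN
                 'MATH10 sp22 31001/31002',          # cross-listed: first CRN wins
                 'Biology 101 - fa23 - 30999 Honors']:  # non-standard: regex fallback
        print(name, '->', short_name_to_crn(name))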
# v1: one row of averages per course section
def process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code):
    fxns = [mean, median, stdev, min, max, len]
    c_id = block[0][0]
    sem = block[0][1]
    course_code = block[0][2]
    cur_scores = [num(x[6]) for x in block]
    final_scores = [num(x[7]) for x in block]
    teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
    if not teacher:
        return
    tch_code = teacher_to_code[teacher]
    crs_code = course_to_code[course_code]
    if len(final_scores) < 2:
        return
    try:
        (cur_mean, cur_median, cur_stdev, cur_min, cur_max, cur_count) = [round(f(cur_scores)) for f in fxns]
        (final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
        cur_pct_passed = above_70(cur_scores, cur_max)
        final_pct_passed = above_70(final_scores, final_max)
        if final_max == 0:
            return
        scaled_final_scores = [x / final_max for x in final_scores]
        (scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores), 2) for f in fxns]
        good_code = ilearn_name_to_course_code(course_code)
        pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
        output.writerow([crs_code, good_code, pocr, tch_code, mode, final_pct_passed,
                         scl_mean, scl_median, scl_stdev, scl_min, scl_max, final_count])
        out_c.writerow([crs_code, good_code, pocr, tch_code, mode, final_pct_passed,
                        scl_mean, scl_median, scl_stdev, final_count])
    except Exception as e:
        print("Exception:", e)

# v2: one line per student/course
def process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code):
    fxns = [mean, median, stdev, min, max, len]
    c_id = block[0][0]
    sem = block[0][1]
    course_code = block[0][2]
    cur_scores = [num(x[6]) for x in block]
    final_scores = [num(x[7]) for x in block]
    teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
    if not teacher:
        return
    tch_code = teacher_to_code[teacher]
    crs_code = course_to_code[course_code]
    if len(final_scores) < 2:
        return
    try:
        # columns: course_code course pocr_status orientation_status teacher_code mode student_id scaled_score
        (final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
        final_pct_passed = above_70(final_scores, final_max)
        if final_max == 0:
            return
        scaled_final_scores = [x / final_max for x in final_scores]
        (scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores), 2) for f in fxns]
        good_code = ilearn_name_to_course_code(course_code)
        pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
        o = load_orientations()
        for row in block:
            student_id = row[3]
            orientation = o[student_id] if student_id in o else 0
            scaled_score = round(num(row[7]) / final_max, 2)
            out_f.writerow([crs_code, good_code, pocr, orientation, tch_code, mode, student_id, scaled_score])
        print(course_code)
    except Exception as e:
        print("Exception:", e)
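# The study questions at the top ask how success rates compare across delivery
# modes. A minimal summary sketch over the per-student file written by
# process_grades(); "passed" reuses the same 70%-of-max convention as above_70().
def summarize_by_mode_sketch():
    df = pd.read_csv(all_courses_file3)
    df['passed'] = df['scaled_score'] >= 0.7
    summary = df.groupby('mode').agg(students=('student_id', 'count'),
                                     mean_scaled=('scaled_score', 'mean'),
                                     pct_passed=('passed', 'mean'))
    print(summary.round(3))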
def process_grades():
    # first pass: build anonymization code tables for teachers and courses
    courses_labeled = {}
    teacher_to_code = {}
    code_to_teacher = {}
    course_to_code = {}
    code_to_course = {}
    index = 1001
    crs_index = 4001
    with open(all_grades_file, newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)
        for row in csvreader:
            crn_sem = row[0] + '_' + row[1]
            if not crn_sem in courses_labeled:
                teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(row[2])
                courses_labeled[crn_sem] = teacher
                if not row[2] in course_to_code:
                    course_to_code[row[2]] = crs_index
                    code_to_course[crs_index] = row[2]
                    crs_index += 1
                if teacher:
                    if not teacher in teacher_to_code:
                        teacher_to_code[teacher] = index
                        code_to_teacher[index] = teacher
                        index += 1
    codecs.open('cache/teacher_lookup_codes.json', 'w', 'utf-8').write(
        json.dumps([teacher_to_code, code_to_teacher], indent=2))
    codecs.open('cache/course_lookup_codes.json', 'w', 'utf-8').write(
        json.dumps([course_to_code, code_to_course], indent=2))
    out_fullrows = codecs.open(all_courses_file3, 'w', 'utf-8')
    out_f = csv.writer(out_fullrows)
    out_f.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
    out_compact = codecs.open(all_courses_file2, 'w', 'utf-8')
    out_c = csv.writer(out_compact)
    out_c.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev count".split(" "))
    # second pass: process the grade rows one course block at a time
    with open(all_courses_file, "w", newline="") as output_f:
        output = csv.writer(output_f)
        output.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev scl_min scl_max count".split(" "))
        with open(all_grades_file, newline="") as csvfile:
            csvreader = csv.reader(csvfile)
            block = []
            current_index = None
            next(csvreader)
            for row in csvreader:
                index = row[0]
                if index != current_index:
                    if block:
                        process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
                        process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
                    block = []
                    current_index = index
                block.append(row)
            if block:
                process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
                process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)

def reorganize_grades_student():
    with open(all_courses_file3, newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        bystudent = defaultdict(list)
        next(csvreader)
        for row in csvreader:
            st = row[6]
            bystudent[st].append(row)
    students = sorted(bystudent.keys())
    with open(all_courses_file4, "w", newline="") as output_f:
        with open(all_courses_file5, "w", newline="") as output_s:
            with open(student_courses_scores, 'w') as output_scs:
                output_s.write("student,courses\n")
                output = csv.writer(output_f)
                output.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
                # student id 0 has no courses
                output.writerow([0, ])
                for st in students:
                    courses = [r[1] for r in bystudent[st]]
                    scores = [r[7] for r in bystudent[st]]
                    zipped = zip(courses, scores)
                    output_scs.write(st + ",")
                    for c, s in zipped:
                        output_scs.write(f"{c}|{s},")
                    output_scs.write("\n")
                    output_s.write(st + "," + " ".join(courses) + "\n")
                    for row in bystudent[st]:
                        output.writerow(row)

def all_course_names_setup():
    cc = json.loads(codecs.open('cache/courses/courses_built.json', 'r', 'utf-8').read())
    courses = {}
    for C in cc.values():
        name = C['dept'] + C['number']
        courses[name] = C
    # names.json was originally seeded from this list:
    # co = codecs.open('cache/courses/names.json', 'w', 'utf-8')
    # for c in sorted(courses.keys()):
    #     co.write(c + "\n")
    from_data = codecs.open('cache/courses_student_scores.csv', 'r', 'utf-8').readlines()
    unknown = {}
    for line in from_data:
        parts = line.split(',')
        stu_id = parts[0]
        for C in parts[1:]:
            each = C.split('|')
            name = each[0]
            if not name in courses:
                unknown[name] = name
    for c in sorted(unknown.keys()):
        print(c)
    # co.write(json.dumps({'unknown': unknown, 'coursenames': courses}, indent=2))

lookup = {}
names = {}

def shell2course(shell):
    """Map an iLearn shell prefix to a canonical course name, via names.json."""
    global lookup, names
    if not lookup:
        cr = json.loads(codecs.open('cache/courses/names.json', 'r', 'utf-8').read())
        lookup = cr['unknown']
        allcourses = cr['coursenames']
        names = allcourses.keys()
    if shell in names:
        return shell
    if shell in lookup:
        c = lookup[shell]
        if c in names:
            return c
    return ""

def stu_record_line(line):
    """Parse one line of courses_student_scores.csv into (student_id, [[course, score], ...])."""
    line = line.strip()
    line = line.strip(',')
    parts = line.split(',')
    stu_id = parts[0]
    courses = []
    for C in parts[1:]:
        courses.append(C.split('|'))
    return stu_id, courses

def stu_record_to_vector(line, boolean=0):
    id, courses = stu_record_line(line)
    yesval = "true" if boolean else 1
    noval = "false" if boolean else 0
    template = json.loads(codecs.open('cache/courses/course_main_record.json', 'r', 'utf-8').read())
    lookup = {}
    for i, c in enumerate(template):
        lookup[c] = i
    vector = [noval for x in range(len(template))]
    for C in courses:
        goodname = shell2course(C[0])
        if goodname:
            vector[lookup[goodname]] = yesval  # could also store C[1], the score
    return id, vector, courses

def grades_to_vectors(boolean=0, verbose=0):
    grades = codecs.open('cache/courses_student_scores.csv', 'r', 'utf-8').readlines()
    for L in grades:
        id, vector, courses = stu_record_to_vector(L, boolean)
        if verbose:
            print(id, vector)
        yield id, vector, courses

def course_main_record():
    return json.loads(codecs.open('cache/courses/course_main_record.json', 'r', 'utf-8').read())
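# For reference, one line of courses_student_scores.csv looks like this
# (id and scores invented for illustration):
#
#   8675,ENG1A|0.91,MATH10|0.78,HIST2|0.84,
#
# i.e. a student id followed by course|scaled_score pairs, which
# stu_record_line() splits on ',' and then on '|'.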
def courses_to_vector_ordered(course_list):
    # each course is (name, semester_order, score)
    template = course_main_record()
    lookup = {}
    for i, c in enumerate(template):
        lookup[c] = i
    vector = ['0' for x in range(len(template))]
    for course, order, score in course_list:
        goodname = shell2course(course)
        if goodname:
            vector[lookup[goodname]] = str(order)
    return vector

def courses_to_vector(course_list, boolean=1):
    yesval = "true" if boolean else 1
    noval = "false" if boolean else 0
    template = course_main_record()
    lookup = {}
    for i, c in enumerate(template):
        lookup[c] = i
    vector = [noval for x in range(len(template))]
    for C in course_list:
        C = C.strip()
        vector[lookup[C]] = yesval
    return vector

def course_vector_to_names(vector):
    template = course_main_record()
    names = []
    for i, v in enumerate(vector):
        if v:
            names.append(template[i])
    return names

def all_course_names():
    ac = json.loads(codecs.open('cache/courses/courses_built.json', 'r', 'utf-8').read())
    master_record = []
    for C in ac.values():
        if C['status'] == 'Draft':
            continue
        name = C['dept'] + C['number']
        master_record.append(name)
    master_record = sorted(set(master_record))
    # Alternative: extract the master record from all 'accomplished courses'
    if 0:
        complete_list = {}
        missing_names = {}
        with open(student_courses_scores, 'r') as input_f:
            for L in input_f:
                stu_id, courses = stu_record_line(L)
                for C in courses:
                    real_name = shell2course(C[0])
                    if real_name:
                        complete_list[real_name] = 1
                    else:
                        missing_names[C[0]] = 1
        master_record = sorted(complete_list.keys())
        print(f"Found {len(master_record)} courses")
        print(master_record)
        print(f"Missing {len(missing_names)} courses")
        print(missing_names)
    mr = codecs.open('cache/courses/course_main_record.json', 'w', 'utf-8')
    mr.write(json.dumps(master_record, indent=2))

from semesters import semester_list, canvas_label
from semesters import code as semester_order
from localcache import all_students_history
from datetime import datetime, timedelta

def semester_dates():
    for c in canvas_label:
        print(semester_list[c])
        length = 15
        if semester_list[c]['code'][0:2] == 'su':
            length = 5  # summer terms run about 5 weeks instead of 15
        start_date = semester_list[c]['start']
        # convert the date string to a datetime object
        date_object = datetime.strptime(start_date, '%m/%d/%y')
        start_fmt = date_object.strftime('%a %b %d, %Y')
        # add the term length (15 or 5 weeks) plus 5 days to the start date
        new_date = date_object + timedelta(weeks=length, days=5)
        end_fmt = new_date.strftime('%a %b %d, %Y')
        print(f"start: {start_fmt}, end: {end_fmt}")
current_student = ""
current_student_block = []
current_student_info = {'first': '', 'last': ''}
normalized_blocks = []
ignore_courses = "El,zACCT20,GASPAR".split(",")
seen_courses = []

def course_line_process(line):
    """Callback for all_students_history(): turn one enrollment row (a dict)
    into a normalized add/drop record in the current student's block."""
    global current_student, current_student_block, seen_courses, normalized_blocks, current_student_info
    sem = line['term_name']
    m1 = re.search(r'^(\d\d\d\d)\s(\w+)$', sem)
    if not m1:
        # not an academic semester; skip
        return
    uid = line['canvasid']
    if uid != current_student:
        if current_student_block:
            current_student_block.append(current_student_info)
            normalized_blocks.append(current_student_block)
            current_student_block = []
        current_student_info = {'first': semester_list[sem]['code'], 'last': ''}
        current_student = uid
    current_student_info['last'] = semester_list[sem]['code']
    year, season = m1.group(1), m1.group(2)
    date_format = "%Y-%m-%d %H:%M:%S.%f"
    create_dt = datetime.strptime(line['created'], date_format)
    update_dt = datetime.strptime(line['updated'], date_format)
    sem_start = datetime.strptime(semester_list[sem]['start'], '%m/%d/%y')
    course = line['course_name']
    c_parts = course.split(' ')
    if c_parts[0] in ignore_courses or c_parts[0] in seen_courses:
        return
    classname = shell2course(c_parts[0])
    if not classname:
        # unknown shell prefix; remember it so it is only handled once
        # print(f"  \"{c_parts[0]}\": \"\",")   # empty dict entry for initial setup
        seen_courses.append(c_parts[0])
    else:
        flow = line['workflow']
        mark = '+'
        if flow == "deleted":
            mark = '-'
        # normal start & finish: record the add date relative to semester start
        add_day = (sem_start - create_dt).days
        sign = '-'
        if add_day < 0:
            add_day = -add_day
            sign = '+'
        temp_usr_name = re.sub(r',', '', line['user_name'])
        current_student_block.append(f"{uid},{temp_usr_name},{classname},add,T{sign}{add_day},{semester_list[sem]['code']}")
        if flow == "deleted":
            # deleted enrollment: also record the drop date
            del_day = (sem_start - update_dt).days
            sign = '-'
            if del_day < 0:
                del_day = -del_day
                sign = '+'
            current_student_block.append(f"{uid},{temp_usr_name},{classname},del,T{sign}{del_day},{semester_list[sem]['code']}")
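# Each normalized add/drop record produced above is a CSV fragment of the form
#
#   canvasid,student name,COURSE,add|del,T-6,fa23
#
# where T-6 means the action happened 6 days before the semester start and T+6
# means 6 days after it; each student's block ends with the {'first','last'}
# semester-info dict. (Example values here are invented for illustration.)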
def normalize_course_histories():
    global normalized_blocks, current_student_block, current_student_info
    all_students_history(course_line_process, limit=99910000)
    current_student_block.append(current_student_info)
    normalized_blocks.append(current_student_block)
    codecs.open('cache/normalized_student_add_drop.json', 'w', 'utf-8').write(json.dumps(normalized_blocks, indent=2))
    # let's see if we can get grades...
    grades_by_student_course = defaultdict(dict)
    print("Doing grades...")
    with codecs.open('cache/courses_student_scores.csv', 'r', 'utf-8') as gradesfile:
        for s in gradesfile:
            parts = s.split(',')
            stu = int(parts[0])
            for c in parts[1:]:
                try:
                    crs, gra = c.split('|')
                    grades_by_student_course[stu][crs] = gra
                except Exception:
                    pass
    # second pass: one row per add/drop event, annotated with grade and tenure
    print("Second pass of grades and student history...")
    student_history = codecs.open('cache/normalized_student_history.csv', 'w', 'utf-8')
    student_history.write("studentid,studentname,course,action,when,grade,sem_name,first_sem,last_sem,tenure_length,sem_index\n")
    semester_order.reverse()  # reverse so .index() yields chronological position
    for blk in normalized_blocks:
        info = blk[-1]
        first = semester_order.index(info['first']) + 1
        last = semester_order.index(info['last']) + 1
        length = last - first + 1
        for course in blk[:-1]:
            parts = course.split(',')
            sem = parts[5]
            sem_index = semester_order.index(sem) - first + 2
            stu = int(parts[0])
            crs = parts[2]
            grade = ""
            if stu in grades_by_student_course:
                if crs in grades_by_student_course[stu]:
                    grade = grades_by_student_course[stu][crs]
            student_history.write(",".join([parts[0], parts[1], parts[2], parts[3], parts[4],
                                            grade, parts[5], str(first), str(last), str(length),
                                            str(sem_index)]) + '\n')
    # make "unified records": one line per student
    student_history_2 = codecs.open('cache/normalized_student_history2.csv', 'w', 'utf-8')
    allcourse = course_main_record()
    template = ['studentid', 'studentname', 'tenure_length']
    template.extend(allcourse)
    student_history_2.write(",".join(template) + "\n")
    for blk in normalized_blocks:
        student_block = []
        info = blk[-1]
        first = semester_order.index(info['first']) + 1
        last = semester_order.index(info['last']) + 1
        length = last - first + 1
        temp_course_holder = {}
        temp_course_grade_holder = {}
        for course in blk[:-1]:
            parts = course.split(',')
            sem = parts[5]
            sem_index = semester_order.index(sem) - first + 2
            stu = int(parts[0])
            crs = parts[2]
            if parts[3] == 'add':
                temp_course_holder[crs] = sem_index
            elif parts[3] == 'del' and crs in temp_course_holder:
                del temp_course_holder[crs]
        # temp_course_holder now maps each kept course to its semester index
        for crs, sem_index in temp_course_holder.items():
            grade = ""
            if stu in grades_by_student_course:
                if crs in grades_by_student_course[stu]:
                    grade = grades_by_student_course[stu][crs]
            student_block.append((crs, sem_index, grade))
        # note: reuses the id/name from the last row parsed for this student
        student_vector = [parts[0], parts[1], str(length)]
        student_vector.extend(courses_to_vector_ordered(student_block))
        student_history_2.write(",".join(student_vector) + '\n')

def cluster_student_histories():
    # Still a stub: loads the per-student scores file and the clustering
    # toolkit, but the feature building and clustering are not written yet
    # (see the k-means sketch below for one possible direction). Note that
    # courses_student_scores.csv rows are ragged (one column per course taken),
    # so read_csv may need custom parsing.
    infile = 'cache/courses_student_scores.csv'
    import matplotlib.pyplot as plt
    from kneed import KneeLocator
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    from sklearn.preprocessing import StandardScaler
    df = pd.read_csv(infile)
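# A minimal k-means sketch for the clustering goal in the plan above ("categorize
# students by course set"), using the 0/1 course vectors from grades_to_vectors().
# k=5 and the silhouette check are illustrative choices, not tuned values.
def kmeans_course_sets_sketch(k=5):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    ids, rows = [], []
    for sid, vector, courses in grades_to_vectors(boolean=0):
        ids.append(sid)
        rows.append(vector)
    X = np.array(rows)
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    print("silhouette:", round(silhouette_score(X, km.labels_), 3))
    # one cluster id per student, in file order
    return dict(zip(ids, km.labels_))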
def dept(s):
    parts = s.split(' ')
    return parts[0]

def try_make_sched():
    term = "fa23"
    sched = requests.get(f"http://gavilan.cc/schedule/{term}_sched.json").json()
    d = "CSIS"
    courses = [[x['code'], x['crn']] for x in sched if dept(x['code']) == d]
    teachers = {x['teacher'] for x in sched if dept(x['code']) == d}
    print(courses)
    print(teachers)

def sched_lookup_tables():
    # Renumber the semesters. (Older four-codes ran sp16 su16 fa16 wi17 ...;
    # winter terms are folded into spring, so only sp/su/fa appear here.)
    sem_fourcode = ("sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 "
                    "sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24 sp25 su25 fa25").split(" ")
    int_numbers = [x for x in range(1, len(sem_fourcode) + 1)]
    fourcode_2_int = {semester: number for semester, number in zip(sem_fourcode, int_numbers)}
    int_2_fourcode = {v: k for k, v in fourcode_2_int.items()}
    sis_2_fourcode = {}
    fourcode_2_sis = {}
    yr = 2018
    sems = ['30', '50', '70']
    i = 0
    semcodes = []
    while yr < 2026:
        for s in sems:
            semcodes.append(f"{yr}{s}")
            sis_2_fourcode[f"{yr}{s}"] = sem_fourcode[i]
            fourcode_2_sis[sis_2_fourcode[f"{yr}{s}"]] = f"{yr}{s}"
            i += 1
        yr += 1
    return fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes

def section_stats_bymode():
    data = query_multiple(
        "SELECT code, semsis, COUNT(id) AS sections, "
        "sum(act) filter (WHERE type='in-person') AS inperson, "
        "sum(act) filter (WHERE type='online') AS online, "
        "sum(act) filter (WHERE type='hybrid') AS hybrid, "
        "sum(act) filter (WHERE type='online live') AS onlinelive "
        "FROM schedule GROUP BY code, semsis ORDER BY code, semsis;",
        'cache/canvas_data/data20231012.db')
    df = pd.DataFrame(data)
    df.fillna(0, inplace=True)
    for L in 'sections,inperson,online,hybrid,onlinelive'.split(','):
        df[L] = df[L].astype(int)
    print(df)
    df.to_csv('cache/section_stats_bymode.csv')
    return df

def section_stats():
    # For each course (e.g. ENG1A): how many students are enrolled across all
    # sections (and broken down by mode, time, location, etc.)? And how many
    # are first-semester Gavilan students?
    data = query_multiple("SELECT * FROM schedule ORDER BY code,id", 'cache/canvas_data/data20231012.db')
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    df = pd.DataFrame(data)
    # drop columns the models don't need
    df = df.drop(columns=['id', 'crn', 'units', 'teacher', 'start', 'end', 'loc', 'cap'])
    codecs.open('cache/sem_mapping.json', 'w', 'utf-8').write(json.dumps(fourcode_2_int, indent=2))
    df['sem'] = df['sem'].map(fourcode_2_int)
    df.set_index('sem', inplace=True)
    return df

# statistics - use a simple exponential smoothing model to predict the next few
# semesters of enrollment. It doesn't really seem to get the patterns: SES
# produces a flat forecast (a constant level), so it can't capture the
# spring/summer/fall seasonality. The Holt-Winters version below does better.
def simple_exp_smoothing_section_model():
    sout = codecs.open('cache/section_predictions.txt', 'w', 'utf-8')
    from statsmodels.tsa.api import SimpleExpSmoothing
    warnings.filterwarnings("ignore")
    periods = 3
    start = 19
    df = section_stats()
    print(df)
    df = df.sort_index()
    predictions = {}
    for course_code in df['code'].unique():
        try:
            print(course_code)
            sout.write(course_code + "\n")
            this_set = df[df['code'] == course_code]['act']
            this_set = this_set.groupby('sem').sum()
            # reindex over the full semester range, filling missing semesters with 0
            new_index = np.arange(this_set.index.min(), this_set.index.max() + 1)
            this_set = this_set.reindex(new_index, fill_value=0)
            print(this_set.to_string())
            sout.write(this_set.to_string() + "\n")
            model = SimpleExpSmoothing(this_set)
            # initial smoothing level of 0.2; raise it if the data is highly variable
            fit = model.fit(smoothing_level=0.2)
            # predict attendance for the next few semesters
            prediction = fit.predict(start=start, end=start + 4)
            print(prediction)
            sout.write(str(prediction) + "\n")
            sout.flush()
            predictions[course_code] = prediction
        except Exception as e:
            print(f"Model creation failed for {course_code} due to {str(e)}")
            sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
    # An ARIMA variant was also tried here:
    # model = ARIMA(this_set, order=(1, 1, 1))   # ARIMA params (p, d, q)
    # model_fit = model.fit()
    # forecast_result = model_fit.forecast(steps=periods)
def exp_smoothing_section_model():
    sout = codecs.open('cache/section_predictions.txt', 'w', 'utf-8')
    from statsmodels.tsa.api import ExponentialSmoothing
    warnings.filterwarnings("ignore")
    periods = 3
    start = 19
    fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
    df = section_stats()
    print(df)
    df = df.sort_index()
    predictions = {}
    for course_code in df['code'].unique():
        try:
            print(course_code)
            this_set = df[df['code'] == course_code]['act']
            this_set = this_set.groupby('sem').sum()
            # reindex over the full semester range, filling missing semesters with 0
            new_index = np.arange(this_set.index.min(), this_set.index.max() + 1)
            this_set = this_set.reindex(new_index, fill_value=0)
            print(this_set.to_string())
            for i, v in this_set.items():
                sout.write(f"{course_code},{int_2_fourcode[i]},{v}\n")
            # Holt-Winters: additive trend and additive seasonality
            model = ExponentialSmoothing(this_set, seasonal_periods=4, trend='add', seasonal='add')
            fit = model.fit()
            prediction = fit.predict(start=start, end=start + 4)
            print(prediction)
            for i, v in prediction.items():
                v = int(v)
                if v < 0:
                    v = 0
                sout.write(f"{course_code},{int_2_fourcode[i]}, {v}\n")
            sout.flush()
            predictions[course_code] = prediction
        except Exception as e:
            print(f"Model creation failed for {course_code} due to {str(e)}")

def student_by_semester():
    query = """
    SELECT u.name, u.canvasid, s.code, s.semsis
    FROM users u
    JOIN enrollment e ON u.id = e.user_id
    JOIN courses c ON c.id = e.course_id
    JOIN terms t ON c.termid = t.id
    JOIN schedule s ON c.schedule = s.id
    WHERE e.type='StudentEnrollment' AND e.workflow='active'
    ORDER BY u.sortablename, s.semsis;
    """
    df = pd.DataFrame(query_multiple(query, 'cache/canvas_data/data20231012.db'))
    # aggregate each semester's courses into one ' / '-separated string
    df['courses'] = df.groupby(['name', 'canvasid', 'semsis'])['code'].transform(lambda x: ' / '.join(x))
    # remove duplicates
    df = df[['name', 'canvasid', 'semsis', 'courses']].drop_duplicates()
    # pivot: one row per student, one column per semester
    df_pivot = df.pivot_table(values='courses', index=['name', 'canvasid'], columns='semsis', aggfunc='first').reset_index()
    # prefix the new columns so they are recognizable
    df_pivot.columns = [str(col) + '_sem' if isinstance(col, int) else col for col in df_pivot.columns]
    df_pivot.to_csv('cache/student_by_semester.csv')

def sections_grouped_by_year_mode():
    df = section_stats_bymode()
    # lists of unique courses and semesters
    df_all_courses = df['code'].unique()
    df_all_semesters = df['semsis'].unique()
    df_all_semesters.sort()
    raw_data = {}
    for _, line in df.iterrows():  # iterrows, not `for line in df`, which yields column names
        sis = str(line['semsis'])
        year = sis[0:4]
        raw_data[f"{line['code']}{year}"] = [line['inperson'], line['online'], line['hybrid'], line['onlinelive']]
    print(raw_data)
    return
    # TODO (unreached): for each course, group semesters into threes by year and
    # fill in values, using 0 where a semester is missing
    for course in df_all_courses:
        c = str(course)
        template = {'code': [c, c, c], 'semsis': [], 'inperson': [], 'online': [], 'hybrid': [], 'onlinelive': []}
        for i in df_all_semesters:
            j = str(i)
            year = j[0:4]
            print(f"{i} ({year})")
        # ...
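# The function above stops short of the yearly rearrangement it describes. A
# minimal pandas sketch of that reshaping, assuming the columns written by
# section_stats_bymode(); the year is taken from the first four digits of the
# SIS semester code.
def sections_by_year_sketch():
    df = section_stats_bymode()
    df['year'] = df['semsis'].astype(str).str[0:4]
    yearly = (df.groupby(['code', 'year'])[['inperson', 'online', 'hybrid', 'onlinelive']]
                .sum()
                .reset_index())
    print(yearly)
    return yearly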
def lstm_model_sections():
    from keras.models import Sequential
    from keras.layers import Dense, LSTM
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    # preprocessing: normalize inputs for better performance
    df = section_stats_bymode()
    print(df)
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset_scaled = scaler.fit_transform(df.drop(['code', 'semsis'], axis=1))
    print("scaled:")
    print(df)
    # split features and targets (predicting 'online' enrollments)
    X = dataset_scaled[:, 1:]
    Y = dataset_scaled[:, 0:1]
    # train / test split
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    # reshape input to [samples, time steps, features], as LSTM layers require
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
    print("x_train shape:", x_train.shape)
    print(x_train)
    print("\n\nTraining...\n\n")
    # LSTM architecture
    model = Sequential()
    model.add(LSTM(50, input_shape=(X.shape[1], 1)))  # 50 LSTM units
    model.add(Dense(1))  # predicting a single output ('online' enrollments)
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(x_train, y_train, epochs=5, batch_size=1)  # train the model
    # prediction
    scaler_predict = MinMaxScaler()
    scaler_predict.fit_transform(df[['online']])
    trainPredict = model.predict(x_train)
    testPredict = model.predict(x_test)
    # invert predictions (undo the normalization)
    trainPredict = scaler_predict.inverse_transform(trainPredict)
    testPredict = scaler_predict.inverse_transform(testPredict)
    # the future prediction is now in testPredict
    print("Predictions:")
    print(testPredict)
    np.savetxt('cache/section_predictions_lstm.txt', testPredict, fmt='%f')
    # I'm lost here... this isn't really a forecast yet: the rows are shuffled
    # course/semester observations, not an ordered time series.
def visualize_course_modes_multi_semester():
    import plotly.express as px
    from plotly.subplots import make_subplots
    seasons = {'sp': '30', 'su': '50', 'fa': '70'}
    semcodes = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24".split(" ")
    # e.g. {'sp23': '202330', 'su23': '202350', 'fa23': '202370'}
    sems = {x: '20' + x[2:] + seasons[x[:2]] for x in semcodes}
    sem_dfs = []
    sem_dfs_depts = []
    for s in sems.keys():
        sched = requests.get(f"http://gavilan.cc/schedule/{s}_sched_expanded.json").json()
        for crs in sched:
            if 'extra' in crs:
                del crs['extra']
            crs['dept'] = crs['code'].split(' ')[0]
        df = pd.DataFrame(sched)
        drop_cols = ['crn', 'sec', 'code', 'cmp', 'name', 'days', 'time', 'rem', 'wl_cap', 'wl_act',
                     'wl_rem', 'teacher', 'date', 'loc', 'ztc', 'time_start', 'time_end', 'start', 'end', 'doy']
        df_depts = df.drop(columns=drop_cols)
        df = df.drop(columns=drop_cols)
        grouped_by_dept = df_depts.groupby(['dept', 'type']).size().reset_index(name='count')
        grouped_by_mode = df['type'].value_counts().reset_index()
        grouped_by_dept["semester"] = sems[s]
        grouped_by_mode["semester"] = sems[s]
        sem_dfs.append(grouped_by_mode)
        sem_dfs_depts.append(grouped_by_dept)
    # stacked bar of section counts by delivery mode, one bar per semester
    combined_data = pd.concat(sem_dfs, axis=0)
    combined_data = combined_data.rename(columns={'type': 'count', 'index': 'type'})
    combined_data.reset_index(drop=True, inplace=True)
    pivoted_data = combined_data.pivot(index='semester', columns='type', values='count')
    pivoted_data.reset_index(inplace=True)
    fig = px.bar(pivoted_data, x='semester', y=['hybrid', 'in-person', 'online', 'online live'],
                 barmode='stack', title='Course Delivery by Semester',
                 color_discrete_sequence=["#000066", "#660000", "#333366", "#9400D3"])
    fig.write_html("cache/sections_by_deliverymode.html")
    combined_data_depts = pd.concat(sem_dfs_depts, axis=0)
    combined_data_depts.reset_index(drop=True, inplace=True)
    combined_data_depts.to_csv('cache/section_delivery_by_dept.csv')
    # one stacked-bar subplot per department
    unique_depts = combined_data_depts['dept'].unique()
    fig = make_subplots(rows=len(unique_depts), cols=1, subplot_titles=unique_depts)
    for i, dept_name in enumerate(unique_depts, start=1):
        # filter and pivot the data for the current department
        dept_data = combined_data_depts[combined_data_depts['dept'] == dept_name]
        pivoted_dept_data = dept_data.pivot(index='semester', columns='type', values='count').reset_index()
        pivoted_dept_data.fillna(0, inplace=True)
        print(pivoted_dept_data)
        # plot only the delivery modes that actually occur in this department
        columns_to_plot = ['hybrid', 'in-person', 'online', 'online live']
        valid_columns = [col for col in columns_to_plot if col in pivoted_dept_data.columns]
        fig_sub = px.bar(pivoted_dept_data, x='semester', y=valid_columns, barmode='stack',
                         color_discrete_sequence=["#000066", "#660000", "#333366", "#9400D3"])
        fig.add_traces(fig_sub['data'], rows=[i] * len(fig_sub['data']), cols=[1] * len(fig_sub['data']))
    fig.update_layout(height=70 * len(fig['data']), width=1100, showlegend=False)
    fig.write_html("cache/sections_depts_by_deliverymode.html")
# Given a list of a student's sections, report back about the student on one row of info.
def student_history_analysis(sh):
    semesters_set = set()
    num_course = len(sh)
    num_units = 0
    units_online = 0
    units_inperson = 0
    units_hybrid = 0
    units_ol = 0
    fa_23_units = 0
    fa_23_online_units = 0
    fa23_courses = 0
    fa23_onlinecourses = 0
    for section in sh:
        semesters_set.add(section['sis'])
        units = float(section['units'].split('-')[0].split('/')[0])
        num_units += units
        if section['type'] == 'in-person':
            units_inperson += units
        if section['type'] == 'online':
            units_online += units
        if section['type'] == 'hybrid':
            units_hybrid += units
        if section['type'] == 'online live':
            units_ol += units
        if section['sis'] == '202370':
            fa_23_units += units
            fa23_courses += 1
            if not section['type'] == 'in-person':
                fa_23_online_units += units
                fa23_onlinecourses += 1
    num_sems = len(semesters_set)
    if num_units == 0:
        pct_online = 0
    else:
        pct_online = round(100 * (units_online + units_hybrid + units_ol) / num_units, 1)
    if fa_23_units == 0:
        fa_23_pct_online = 0
    else:
        fa_23_pct_online = round(100 * fa_23_online_units / fa_23_units, 1)
    if fa23_courses == 0:
        fa23_pct_course_online = 0
    else:
        fa23_pct_course_online = round(100 * fa23_onlinecourses / fa23_courses, 1)
    summary = [units, num_course,
               f"\"{sh[0]['sortablename']}\",{sh[0]['canvasid']},{num_sems},{num_course},{num_units},"
               f"{units_online},{units_inperson},{units_hybrid},{units_ol},{pct_online},"
               f"{fa_23_units},{fa_23_online_units},{fa_23_pct_online},"
               f"{fa23_courses},{fa23_onlinecourses},{fa23_pct_course_online}"]
    return summary
def report_student_stats():
    from localcache import users_with_history, students_current_semester
    from itertools import groupby
    import plotly.graph_objects as go
    import plotly.io as pio
    u = users_with_history()
    this_sem = [x['canvasid'] for x in students_current_semester()]
    df = pd.DataFrame(u)
    filtered_df = df[df['canvasid'].isin(this_sem)]
    filtered_df.to_csv('cache/student_history_current_students.csv', index=False)
    oo = codecs.open('cache/student_units.txt', 'w', 'utf-8')
    oo.write("name,id,num_sems,num_course,num_units,units_online,units_inperson,units_hybrid,"
             "units_ol,percent_online,fa23_units,fa23_onlineunits,fa23_pct_online,"
             "fa23_num_courses,fa23_num_onlinecourses,fa23_percent_online_course\n")
    # group the history rows by student id
    def kk(x):
        return x['canvasid']
    grouped_dict = {key: list(group) for key, group in groupby(u, kk)}
    shorter = []
    percentages = []
    for k, g in grouped_dict.items():
        if k in this_sem:
            h = student_history_analysis(g)
            oo.write(str(h[2]) + "\n")
            shorter.append(h)
            p = h[2].split(',')[-1]
            percentages.append(float(p))
        else:
            print(f"Skipping {k}")
    # histogram of each student's fa23 percent-of-courses-online figure
    fig = go.Figure(data=[go.Histogram(x=percentages, xbins=dict(start=0, end=101, size=10))])
    # save the figure in an HTML file
    pio.write_html(fig, 'cache/student_pct_onlinecourse.html')

def test_rpy():
    from rpy2.robjects.packages import importr, data
    from rpy2.robjects.lib import grid
    import rpy2.robjects.lib.ggplot2 as ggplot2
    stats = importr('stats')
    grdevices = importr('grDevices')
    base = importr('base')
    datasets = importr('datasets')
    grid.activate()
    mtcars = data(datasets).fetch('mtcars')['mtcars']
    pp = (ggplot2.ggplot(mtcars)
          + ggplot2.aes_string(x='wt', y='mpg', col='factor(cyl)')
          + ggplot2.geom_point()
          + ggplot2.geom_smooth(ggplot2.aes_string(group='cyl'), method='lm'))
    pp.plot()

def test_rpy2():
    import rpy2
    print(rpy2.__version__)
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    base = importr('base')    # R's "base" package
    utils = importr('utils')  # R's "utils" package
    pi = robjects.r['pi']
    print(f"pi={pi[0]}")

if __name__ == "__main__":
    options = {
        1: ['get all historical grades from ilearn', get_all],
        2: ['process grades csv file', process_grades],
        3: ['reorganize full grades file by student', reorganize_grades_student],
        4: ['test shortname parse', nametest],
        5: ['test sem codes', codetest],
        6: ['get student data from orientations', get_student_orientations],
        7: ['manage course master list', all_course_names],
        8: ['grades to vectors', grades_to_vectors],
        9: ['semester startdates list', semester_dates],
        10: ['normalize course histories', normalize_course_histories],
        11: ['cluster student histories', cluster_student_histories],
        12: ['try to make a schedule', try_make_sched],
        13: ['ES model section predict attendance', exp_smoothing_section_model],
        14: ['section stats by mode', section_stats_bymode],
        15: ['student courses by semester', student_by_semester],
        16: ['LSTM model sections', lstm_model_sections],
        17: ['rearrange section data to yearly form', sections_grouped_by_year_mode],
        30: ['visualize course modes multi semester', visualize_course_modes_multi_semester],
        31: ['Report on student stats', report_student_stats],
        32: ['test rpy', test_rpy],
    }
    print('')
    if len(sys.argv) > 1 and re.search(r'^\d+', sys.argv[1]):
        resp = int(sys.argv[1])
        print("\n\nPerforming: %s\n\n" % options[resp][0])
    else:
        print('')
        for key in options:
            print(str(key) + '.\t' + options[key][0])
        print('')
        resp = input('Choose: ')
    # call the chosen function from the options dict
    options[int(resp)][1]()
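# Usage: pass a menu number on the command line to skip the prompt, e.g.
#
#   python thisscript.py 2    # process grades csv file
#
# (substitute whatever this file is saved as); run with no arguments to get
# the numbered menu instead.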