# canvasapp/stats.py
# statistics
"""
## Investigate: Success rates (grades) of students in:
- online courses (overall)
- sync, async, and online live
- teachers/courses that have passed POCR (are all async?)
- teachers that have done more than the minimum training in online teaching
- in-person classes, if grades are available
## Data collection
- Choose how many semesters (10?)
- Script 1 - given a CRN and semester, download all grades
  - Check that grades were actually used and make sense
  - Compute mean, % > 70, median, etc.
  - Anonymization steps
    - replace teacher names w/ id number
    - replace student names w/ id number
    - replace course names w/ course code
- Script 2 - given all semester schedules, generate lists of:
  - CRNs which are online, online live, hybrid, in-person, excluded
  - CRNs in which teacher and course have passed POCR (and semester is later than their pass date)
  - CRNs in which teacher passed POCR for a different course (and semester is later than their pass date)
  - CRNs to exclude, for example SP20 because of COVID; possibly SU20 and FA20
  - CRNs which are POCR approved
  - CRNs in which teacher has done more than the minimum training in online teaching
  - Student ids which have participated in the online orientation over a certain threshold
- Next steps: generate the cross-reference for which categories teachers are in, and
  integrate it into the main data file.
- Next steps (June/July 2023)
  - add campus, time of day, and sem_order (which semester in their college career they took it) columns
  - Organize rows by student
  - Develop a way to categorize them: by course set and/or score set (clustering: kmeans, forest, etc.)
- Goals
  - display and summarize clusters of students on a dashboard
  - ongoing categorization (implying course recommendations and interventions) based on it
## Hypothesis Testing
"""
import codecs, os
import json, csv, requests, sys, re
from multiprocessing import Semaphore
from statistics import mean, median, stdev
from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments
from collections import defaultdict
all_grades_file = "cache/grades_all.csv"
all_courses_file = "cache/course_grades_all.csv"
all_courses_file2 = "cache/course_grades_compact.csv"
all_courses_file3 = "cache/course_grades_full.csv"
all_courses_file4 = "cache/course_grades_full_bystudent.csv"
all_courses_file5 = "cache/courses_passed_bystudent.csv"
student_courses_scores = "cache/courses_student_scores.csv"
student_orientation_participation = 'cache/participation_orientation_courses.json'
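# num(): coerce a grade-CSV cell to a number -- blank cells become 0, whole-number
# scores become ints, anything else falls back to float.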
def num(s):
if s == '': return 0
    s = re.sub(r'\.0$', '', s)  # strip only a trailing ".0" so whole-number scores parse as ints
try:
return int(s)
except ValueError:
return float(s)
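# Convert numeric term codes like 202370 to short semester codes like fa23, and back.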
def sem_num_to_code(sem_num):
p = re.search(r'^(\d\d\d\d)(\d\d)$', sem_num)
if p:
yr = p.group(1)[2:4]
sem = p.group(2)
lookup = {'10':'wi','30':'sp', '50':'su', '70':'fa'}
return f"{lookup[sem]}{yr}"
return ""
def sem_code_to_num(sem_code): # fa23
p = re.search(r'^([a-z]{2})(\d\d)$', sem_code)
if p:
s = p.group(1)
y = p.group(2)
lookup = {'wi':'10','sp':'30', 'su':'50', 'fa':'70'}
return f"20{y}{lookup[s]}"
return ""
def codetest():
sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')
codes = 'fa21 wi22 sp23 su23 fa23 wi24'.split(' ')
for s in sems:
print("{}: {}".format(s, sem_num_to_code(s)))
for c in codes:
print("{}: {}".format(c, sem_code_to_num(c)))
def get_all():
terms = '178 177 176 175 174 173 172 171 168 65 64 62 63 61 60 25 26 23 22 21'.split(' ')
sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')
# Save grades to a CSV file
with open(all_grades_file, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["crn", "sem", "coursecode", "s_can_id","g","name", "current", "final"])
for (term,sem) in zip(terms,sems):
print(term,sem,"\n")
courses = getCoursesInTerm(term,get_fresh=0,show=0,active=1)
for c in courses:
print(c['name'])
c_code = c['course_code']
grades(writer, sem, c['id'], c_code)
csvfile.flush()
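# Download the roster for one course (including enrollment grades) and write a CSV
# row of current/final scores for each student enrollment.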
def grades(writer, sem, COURSE_ID, course_code):
params = { "include[]": ["enrollments", "current_grading_period_scores"] }
grades = fetch(url + f"/api/v1/courses/{COURSE_ID}/users",0, params)
#grades = json.loads(grades.text)
for student in grades:
try:
id = student["id"]
name = student["name"]
g = student["login_id"]
print("\t", name)
if student['enrollments'][0]['type'] == 'StudentEnrollment':
grade = student["enrollments"][0]["grades"]["final_score"]
current = student["enrollments"][0]["grades"]["current_score"]
writer.writerow([COURSE_ID, sem, course_code, id, g, name, current, grade])
except Exception as e:
print("Exception:", e)
def get_student_orientations():
courses = {'iLearn Student Orientation 2022':'9768', # 8170 students
'Kickstart Online Orientation - Transfer':'36', # 6149
'Kickstart Online Orientation - New to College':'35', # 5392
'LIB732 SP18':'3295', # 2193
'LIB732 FA17':'2037', # 1868
'LIB732 SP17':'69', # 1645
'Kickstart Online Orientation - Returning':'37', # 1463
'iLearn Student Orientation 2023':'15924', # 1292
'LIB732 SU17':'1439' # 1281
}
views_bycourse = {}
all_student_ids = set()
# get pageviews of each orientation course
for c,i in courses.items():
print(c)
cache_file_name = f'cache/participation_course_{i}.json'
student_ids = [x[1] for x in get_course_enrollments(i)]
all_student_ids.update(student_ids)
if os.path.exists(cache_file_name):
pv = json.loads(codecs.open(cache_file_name,'r','utf-8').read())
else:
pv = get_student_page_views(i, student_ids)
codecs.open(cache_file_name,'w','utf-8').write(json.dumps(pv,indent=2))
views_bycourse[i] = pv
# add up pageviews for each student
views_bystudent = {}
for student_id in all_student_ids:
views_bystudent[student_id] = sum([views_bycourse[i].get(student_id,0) for i in courses.values()])
codecs.open(student_orientation_participation,'w','utf-8').write(json.dumps(views_bystudent,indent=2))
def get_student_page_views(course_id, student_ids):
page_views = {}
verbose = 0
for student_id in student_ids:
a = f'/api/v1/courses/{course_id}/analytics/users/{student_id}/activity'
response = fetch(url + a, verbose)
page_views[student_id] = sum(response.get('page_views', {}).values())
if verbose: print(page_views)
return page_views
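# Lazily loaded lookups: schedules[sem_code] holds the expanded schedule JSON for a
# semester, and orientations maps student id to total orientation page views.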
schedules = {}
orientations = {}
def load_schedules():
global schedules
if not schedules:
for f in os.listdir('cache/schedule'):
m = re.search(r'(\w\w\d\d)_sched_expanded\.json', f)
if m:
sem = m.group(1)
schedules[sem] = json.loads( codecs.open('cache/schedule/' + f, 'r', 'utf-8').read() )
def load_orientations():
global orientations
if not orientations:
orientations = json.loads( codecs.open(student_orientation_participation,'r','utf-8').read() )
return orientations
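# Parse iLearn course short names (expected form "CODE sem CRN", with a 5-digit CRN)
# into (crn, sem), falling back to regex scanning for non-standard names;
# short_name_to_teacher_type_crn_sem() then resolves the teacher and delivery mode
# from the semester schedule (winter codes are folded into spring before lookup).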
def to_crn_fallback(name):
#print(name)
name = name.lower()
try:
m1 = re.search(r'(\d\d\d\d\d)',name)
if m1:
crn = m1.group(1)
else:
return None,None
m2 = re.search(r'([wispufa][wispufa]\d\d)',name.lower())
if m2:
sem = m2.group(1)
else:
return None, None
#print(name, crn, sem)
return crn, sem
except Exception as e:
#print("Exception: ", e, name)
return None, None
def ilearn_name_to_course_code(iname):
parts = iname.split(' ')
code = parts[0]
return code
def short_name_to_crn(name):
#print(name)
try:
parts = name.split(' ')
code = parts[0]
sem = parts[1]
crn = parts[2]
m_sem = re.search(r'^(\w\w\d\d)$',sem)
if not m_sem:
return to_crn_fallback(name)
m = re.search(r'^(\d\d\d\d\d)$',crn)
if m:
return crn,sem
else:
crn_parts = crn.split('/')
m = re.search(r'^(\d\d\d\d\d)$',crn_parts[0])
if m:
return crn_parts[0],sem
#print("non standard course short name: ", code, sem, crn)
return to_crn_fallback(name)
except Exception as e:
#print("Exception: ", e, name)
return to_crn_fallback(name)
def fixname(n):
return re.sub(r'\s+',' ', n).strip()
def short_name_to_teacher_type_crn_sem(name):
load_schedules()
crn, sem = short_name_to_crn(name)
try:
if sem:
sem = sem.lower()
if sem[0:2]=='wi':
sem = 'sp' + sem[2:]
for course in schedules[sem]:
if course['crn'] == crn:
return fixname(course['teacher']), course['type'], crn, sem
except Exception as e:
return None, None, None, None
return None, None, None, None
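# POCR records: cache/pocr_passed.csv maps "teacher course" to the semester code in
# which they passed; lookup_pocr() reports whether a given semester falls after that pass.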
pocrs = {}
def load_pocrs():
global pocrs
if not pocrs:
with open('cache/pocr_passed.csv') as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader)
for row in csvreader:
pocrs[row[0] + " " + row[1]] = row[2]
return pocrs
def lookup_pocr(teacher,course,sem):
p = load_pocrs()
pcode = teacher + " " + course
if pcode in p:
sem_passed = sem_code_to_num(p[pcode])
sem_test = sem_code_to_num(sem)
if sem_passed < sem_test:
return True
return False
def nametest():
with open(all_courses_file) as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader)
for row in csvreader:
print(row[0], "-", short_name_to_teacher_type_crn_sem(row[0]))
def above_70(li,maximum):
cutoff = 0.7 * maximum
above = list(filter(lambda x: x >= cutoff, li))
return round(len(above)/len(li), 3)
# v1, does a row of averages for each course
def process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code):
fxns = [mean, median, stdev, min, max, len]
c_id = block[0][0]
sem = block[0][1]
course_code = block[0][2]
cur_scores = [num(x[6]) for x in block]
final_scores = [num(x[7]) for x in block]
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
if not teacher:
return
tch_code = teacher_to_code[teacher]
crs_code = course_to_code[course_code]
if len(final_scores) < 2:
return
try:
(cur_mean, cur_median, cur_stdev, cur_min, cur_max, cur_count) = [round(f(cur_scores)) for f in fxns]
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
cur_pct_passed = above_70(cur_scores, cur_max)
final_pct_passed = above_70(final_scores, final_max)
if final_max == 0: return
scaled_final_scores = [ x / final_max for x in final_scores]
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
good_code = ilearn_name_to_course_code(course_code)
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
output.writerow( [crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, scl_min, scl_max, final_count] )
out_c.writerow([crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, final_count])
except Exception as e:
print("Exception:", e)
# v2, one line per student/course
def process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code):
fxns = [mean, median, stdev, min, max, len]
c_id = block[0][0]
sem = block[0][1]
course_code = block[0][2]
cur_scores = [num(x[6]) for x in block]
final_scores = [num(x[7]) for x in block]
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
if not teacher:
return
tch_code = teacher_to_code[teacher]
crs_code = course_to_code[course_code]
if len(final_scores) < 2:
return
try:
# "course_code course pocr_status orientation_status teacher_code mode student_id scaled_score"
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
final_pct_passed = above_70(final_scores, final_max)
if final_max == 0: return
scaled_final_scores = [ x / final_max for x in final_scores]
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
good_code = ilearn_name_to_course_code(course_code)
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
o = load_orientations()
for row in block:
student_id = row[3]
orientation = o[student_id] if student_id in o else 0
scaled_score = round(num(row[7]) / final_max, 2)
out_f.writerow([crs_code, good_code, pocr, orientation, tch_code, mode, student_id, scaled_score])
print(course_code)
except Exception as e:
print("Exception:", e)
def process_grades():
# first loop to get all names
courses_labeled = {}
teacher_to_code = {}
code_to_teacher = {}
course_to_code = {}
code_to_course = {}
index = 1001
crs_index = 4001
with open(all_grades_file, newline="") as csvfile:
csvreader = csv.reader(csvfile)
next(csvreader)
for row in csvreader:
crn_sem = row[0] + '_' + row[1]
if not crn_sem in courses_labeled:
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(row[2])
courses_labeled[crn_sem] = teacher
if not row[2] in course_to_code:
course_to_code[row[2]] = crs_index
code_to_course[crs_index] = row[2]
crs_index += 1
if teacher:
if not teacher in teacher_to_code:
teacher_to_code[teacher] = index
code_to_teacher[index] = teacher
index += 1
codecs.open('cache/teacher_lookup_codes.json','w','utf-8').write( json.dumps( [teacher_to_code, code_to_teacher], indent=2) )
codecs.open('cache/course_lookup_codes.json','w','utf-8').write( json.dumps( [course_to_code, code_to_course], indent=2) )
out_fullrows = codecs.open(all_courses_file3,'w','utf-8')
out_f = csv.writer(out_fullrows)
out_f.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
out_compact = codecs.open(all_courses_file2,'w','utf-8')
out_c = csv.writer(out_compact)
out_c.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev count".split(" "))
with open(all_courses_file, "w", newline="") as output_f:
output = csv.writer(output_f)
output.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev scl_min scl_max count".split(" "))
with open(all_grades_file, newline="") as csvfile:
csvreader = csv.reader(csvfile)
block = []
current_index = None
next(csvreader)
for row in csvreader:
index = row[0]
if index != current_index:
if block:
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
block = []
current_index = index
block.append(row)
if block:
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
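# Regroup the full per-student grade rows by student id and write three views: all rows
# ordered by student, a course list per student, and course|score pairs per student.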
def reorganize_grades_student():
with open(all_courses_file3, newline="") as csvfile:
csvreader = csv.reader(csvfile)
bystudent = defaultdict(list)
next(csvreader)
for row in csvreader:
st = row[6]
bystudent[st].append(row)
students = sorted(bystudent.keys())
with open(all_courses_file4, "w", newline="") as output_f:
with open(all_courses_file5, "w", newline="") as output_s:
with open(student_courses_scores,'w') as output_scs:
output_s.write("student,courses\n")
output = csv.writer(output_f)
output.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
# student id 0 has no courses
output.writerow([0,])
for st in students:
courses = [r[1] for r in bystudent[st]]
scores = [r[7] for r in bystudent[st]]
zipped = zip(courses,scores)
output_scs.write(st + ",")
for c,s in zipped:
output_scs.write(f"{c}|{s},")
output_scs.write("\n")
output_s.write(st + "," + " ".join(courses) + "\n")
for row in bystudent[st]:
output.writerow(row)
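# One-off helper: compare course names seen in the student score data against the
# built course catalog and print the ones that can't be matched.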
def all_course_names_setup():
cc = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
courses = {}
for C in cc.values():
name = C['dept'] + C['number']
#print(name)
courses[ name ] = C
#co = codecs.open('cache/courses/names.json','w','utf-8')
#for c in sorted(courses.keys()):
# co.write(c + "\n")
cr = codecs.open('cache/courses/names.json','r','utf-8')
from_data = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
unknown = {}
for line in from_data:
parts = line.split(',')
stu_id = parts[0]
ea = parts[1:]
for C in ea:
each = C.split('|')
name = each[0]
if not name in courses:
unknown[name] = name
#data_courses[each[0]] += 1
for c in sorted(unknown.keys()):
print(c)
#co.write( json.dumps( {'unknown':unknown, 'coursenames':courses}, indent=2 ))
lookup = {}
names = {}
def shell2course(shell):
global lookup, names
if not lookup:
cr = json.loads(codecs.open('cache/courses/names.json','r','utf-8').read())
lookup = cr['unknown']
allcourses = cr['coursenames']
names = allcourses.keys()
if shell in names:
return shell
if shell in lookup:
c = lookup[shell]
if c in names:
return c
#print(f"Can't find course: {shell}")
return ""
def stu_record_line(line):
line = line.strip()
line = line.strip(',')
parts = line.split(',')
stu_id = parts[0]
courses = []
for C in parts[1:]:
courses.append(C.split('|'))
return stu_id, courses
def stu_record_to_vector(line, boolean=0):
id, courses = stu_record_line(line)
yesval = "true" if boolean else 1
noval = "false" if boolean else 0
template = json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
lookup = {}
for i,c in enumerate(template):
lookup[c] = i
vector = [noval for x in range(len(template))]
for C in courses:
goodname = shell2course(C[0])
if goodname:
vector[lookup[goodname]] = yesval # C[1] # score
return id,vector,courses
def grades_to_vectors(boolean=0, verbose=0):
grades = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
for L in grades:
id, vector, courses = stu_record_to_vector(L,boolean)
if verbose: print(id, vector)
yield id, vector, courses
def course_main_record():
return json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
def courses_to_vector_ordered(course_list):
# each course is (name, semester_order, score)
template = course_main_record()
lookup = {}
for i,c in enumerate(template):
lookup[c] = i
vector = ['0' for x in range(len(template))]
for course,order,score in course_list:
goodname = shell2course(course)
if goodname:
vector[lookup[goodname]] = str(order)
return vector
def courses_to_vector(course_list, boolean=1):
#print(course_list)
yesval = "true" if boolean else 1
noval = "false" if boolean else 0
template = course_main_record()
lookup = {}
for i,c in enumerate(template):
lookup[c] = i
vector = [noval for x in range(len(template))]
for C in course_list:
C = C.strip()
#goodname = shell2course(C[0])
#if goodname:
#print(C)
vector[lookup[C]] = yesval # C[1] # score
#print(vector)
return vector
def course_vector_to_names(vector):
template = course_main_record()
names = []
for i,v in enumerate(vector):
if v:
names.append(template[i])
return names
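# Build the course master record (sorted unique dept+number names, drafts excluded)
# and write it to cache/courses/course_main_record.json.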
def all_course_names():
ac = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
master_record = []
for C in ac.values():
if C['status'] == 'Draft':
continue
name = C['dept'] + C['number']
master_record.append(name)
master_record = set(master_record)
master_record = list(master_record)
master_record = sorted(master_record)
## Extract from all 'accomplished courses'...
if 0:
complete_list = {}
missing_names = {}
with open(student_courses_scores,'r') as input_f:
for L in input_f:
stu_id, courses = stu_record_line(L)
for C in courses:
real_name = shell2course(C[0])
if real_name:
complete_list[real_name] = 1
else:
missing_names[C[0]] = 1
master_record = sorted(complete_list.keys())
print(f"Found {len(master_record)} courses")
print(master_record)
print(f"Missing {len(missing_names)} courses")
print(missing_names)
mr = codecs.open('cache/courses/course_main_record.json','w','utf-8')
mr.write(json.dumps(master_record,indent=2))
from semesters import semester_list, canvas_label
from semesters import code as semester_order
from localcache import all_students_history
from datetime import datetime, timedelta
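# Print each semester's start date and an estimated end date (term length in weeks plus 5 days).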
def semester_dates():
#print()
for c in canvas_label:
print(semester_list[c])
length = 15
if semester_list[c]['code'][0:2] == 'su':
length = 5
start_date = semester_list[c]['start']
# Convert the date string to a datetime object
date_object = datetime.strptime(start_date, '%m/%d/%y')
start_fmt = date_object.strftime('%a %b %d, %Y')
        # Add the term length (15 weeks, or 5 for summer) plus 5 days to the start date
        new_date = date_object + timedelta(weeks=length, days=5)
# Format the new date as a string
new_date_string = new_date.strftime('%m/%d/%y')
end_fmt = new_date.strftime('%a %b %d, %Y')
# Print the new date
print(f"start: {start_fmt}, end: {end_fmt}")
current_student = ""
current_student_block = []
current_student_info = {'first':'', 'last':''}
normalized_blocks = []
ignore_courses = "El,zACCT20,GASPAR".split(",")
seen_courses = []
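# Callback for all_students_history(): for each enrollment line in an academic term,
# record an "add" event (and a "del" event for deleted enrollments) with the day offset
# from the semester start, grouped into per-student blocks in normalized_blocks.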
def course_line_process(line):
global current_student, current_student_block, seen_courses, normalized_blocks, current_student_info
sem = line['term_name']
m1 = re.search(r'^(\d\d\d\d)\s(\w+)$', sem)
if not m1: # is NOT an academic semester, skip
return
uid = line['canvasid']
if uid != current_student:
if current_student_block:
current_student_block.append(current_student_info)
normalized_blocks.append(current_student_block)
current_student_block = []
current_student_info = {'first':semester_list[sem]['code'], 'last':''}
current_student = uid
#print(f"Student: {uid} ({line['user_name']})")
# line is a dict
current_student_info['last'] = semester_list[sem]['code']
year, season = m1.group(1), m1.group(2)
date_format = "%Y-%m-%d %H:%M:%S.%f"
create_dt = datetime.strptime(line['created'], date_format)
update_dt = datetime.strptime(line['updated'], date_format)
sem_start = datetime.strptime(semester_list[sem]['start'], '%m/%d/%y')
course = line['course_name']
c_parts = course.split(' ')
if c_parts[0] in ignore_courses or c_parts[0] in seen_courses:
return
classname = shell2course(c_parts[0])
if not classname:
# print empty dict entry for initial setup
# print(f" \"{c_parts[0]}\": \"\",")
seen_courses.append(c_parts[0])
else:
#
flow = line['workflow']
mark = '+'
if flow == "deleted": mark = '-'
# normal start & finish, give add date
add_day = sem_start - create_dt
add_day = add_day.days
sign = '-'
if add_day < 0:
add_day = -add_day
sign = '+'
#print(f" {mark} {classname} added T{sign}{add_day} {semester_list[sem]['code']}")
temp_usr_name = re.sub(r',','',line['user_name'])
current_student_block.append(f"{uid},{temp_usr_name},{classname},add,T{sign}{add_day},{semester_list[sem]['code']}")
if flow == "deleted":
# deleted, give delete date
del_day = sem_start - update_dt
del_day = del_day.days
sign = '-'
if del_day < 0:
del_day = -del_day
sign = '+'
#print(f" {mark} {classname} deleted T{sign}{del_day} {semester_list[sem]['code']}")
current_student_block.append(f"{uid},{temp_usr_name},{classname},del,T{sign}{del_day},{semester_list[sem]['code']}")
def normalize_course_histories():
global normalized_blocks, current_student_block, current_student_info
all_students_history(course_line_process, limit=99910000)
current_student_block.append(current_student_info)
normalized_blocks.append(current_student_block)
codecs.open('cache/normalized_student_add_drop.json','w','utf-8').write(json.dumps(normalized_blocks,indent=2))
# let's see if we can get grades...
grades_by_student_course = defaultdict(dict)
print("Doing grades...")
with codecs.open('cache/courses_student_scores.csv','r','utf-8') as gradesfile:
for s in gradesfile:
parts = s.split(',')
stu = int(parts[0])
#print(stu)
for c in parts[1:]:
try:
#print(c)
crs,gra = c.split('|')
grades_by_student_course[stu][crs] = gra
except Exception as e:
pass
# go through again
print("Second pass of grades and student history...")
student_history = codecs.open('cache/normalized_student_history.csv','w','utf-8')
student_history.write("studentid,studentname,course,action,when,grade,sem_name,first_sem,last_sem,tenure_length,sem_index\n")
semester_order.reverse()
for blk in normalized_blocks:
info = blk[-1]
first = semester_order.index(info['first']) + 1
last = semester_order.index(info['last']) + 1
length = last - first + 1
for course in blk[:-1]:
parts = course.split(',')
#print(parts)
sem = parts[5]
sem_index = semester_order.index(sem) - first + 2
stu = int(parts[0])
crs = parts[2]
grade = ""
if stu in grades_by_student_course:
if crs in grades_by_student_course[stu]:
grade = grades_by_student_course[stu][crs]
student_history.write(",".join([parts[0], parts[1], parts[2], parts[3], parts[4], grade, parts[5], str(first), str(last), str(length), str(sem_index), ]) + '\n')
# make "unified records" or one line per student
student_history_2 = codecs.open('cache/normalized_student_history2.csv','w','utf-8')
allcourse = course_main_record()
#print(allcourse)
template = ['studentid', 'studentname', 'tenure_length']
template.extend(allcourse)
#print(template)
student_history_2.write( ",".join(template) + "\n" )
for blk in normalized_blocks:
        if len(blk) < 2:
            continue  # no recorded course events for this student; skip (avoids reusing stale 'parts')
        student_block = []
info = blk[-1]
first = semester_order.index(info['first']) + 1
last = semester_order.index(info['last']) + 1
length = last - first + 1
temp_course_holder = {}
temp_course_grade_holder = {}
for course in blk[:-1]:
parts = course.split(',')
#print(parts)
sem = parts[5]
sem_index = semester_order.index(sem) - first + 2
stu = int(parts[0])
crs = parts[2]
if parts[3] == 'add':
temp_course_holder[crs] = sem_index
elif parts[3] == 'del' and crs in temp_course_holder:
del temp_course_holder[crs]
# now the temp_course_holder has the courses and semesters
for crs,sem_index in temp_course_holder.items():
grade = ""
if stu in grades_by_student_course:
if crs in grades_by_student_course[stu]:
grade = grades_by_student_course[stu][crs]
this_record = (crs, sem_index, grade)
student_block.append(this_record)
student_vector = [ parts[0], parts[1], str(length) ]
student_vector.extend(courses_to_vector_ordered(student_block))
student_history_2.write(",".join(student_vector) + '\n')
#print(student_vector)
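# A minimal sketch toward the "Hypothesis Testing" section of the module docstring, not
# wired into the menu below: compare percent_passed between POCR and non-POCR courses
# using the compact per-course file. Assumes scipy is installed; Welch's t-test is one
# reasonable choice here, not necessarily the test this project settles on.
def pocr_pass_rate_ttest():
    from scipy import stats  # assumption: scipy available in this environment
    pocr, non_pocr = [], []
    with open(all_courses_file2, newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header
        for row in csvreader:
            # columns: course_code, course, pocr_status, teacher_code, mode, percent_passed, ...
            (pocr if row[2] == '1' else non_pocr).append(float(row[5]))
    t, p = stats.ttest_ind(pocr, non_pocr, equal_var=False)
    print(f"POCR n={len(pocr)}, non-POCR n={len(non_pocr)}, t={t:.3f}, p={p:.4f}")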
def cluster_student_histories():
infile = 'cache/courses_student_scores.csv'
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
df = pd.read_csv(infile)
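    # Hedged sketch of the clustering step from the module docstring (kmeans over
    # per-student course vectors). It builds the matrix with grades_to_vectors()
    # instead of the raw per-student CSV loaded above; the k range and silhouette
    # scoring are illustrative assumptions, not settled analysis choices.
    ids, rows = [], []
    for sid, vector, _courses in grades_to_vectors(boolean=0):
        ids.append(sid)
        rows.append(vector)
    X = StandardScaler().fit_transform(rows)
    for k in range(2, 9):
        km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
        print(f"k={k}: silhouette={silhouette_score(X, km.labels_):.3f}")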
if __name__ == "__main__":
options = { 1: ['get all historical grades from ilearn',get_all] ,
2: ['process grades csv file',process_grades] ,
3: ['reorganize full grades file by student', reorganize_grades_student],
4: ['test shortname parse',nametest] ,
5: ['test sem codes',codetest] ,
6: ['get student data from orientations', get_student_orientations],
7: ['manage course master list', all_course_names],
8: ['grades to vectors', grades_to_vectors],
9: ['semester startdates list', semester_dates],
10: ['normalize course histories', normalize_course_histories],
11: ['cluster student histories', cluster_student_histories],
}
print ('')
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()