# statistics
"""
## Investigate: Success rates (grades) of students in:

- online courses (overall)
- sync, async, and online live courses
- teachers/courses that have passed POCR (are they all async?)
- teachers who have done more than the minimum training in online teaching
- in-person classes, if grades are available


## Data collection

- Choose how many semesters (10?)
- Script 1 - given a CRN and semester, download all grades
    - Check if grades were used and make sense
    - Compute mean, % > 70, median, etc.

- Anonymization steps
    - replace teacher names with ID numbers
    - replace student names with ID numbers
    - replace course names with course codes

- Script 2 - given all semester schedules, generate lists of:
    - CRNs which are online, online live, hybrid, in-person, or excluded
    - CRNs in which the teacher and course have passed POCR (and the semester is later than their pass date)
    - CRNs in which the teacher passed POCR for a different course (and the semester is later than their pass date)
    - CRNs to exclude, for example SP20 because of COVID; possibly SU20 and FA20
    - CRNs which are POCR approved
    - CRNs in which the teacher has done more than the minimum training in online teaching
    - Student IDs which have participated in the online orientation above a certain threshold

- Next steps: generate the cross-reference for which categories teachers are in, and
  integrate it into the main data file.

- Next steps (June/July 2023)
    - add campus, time of day, and sem_order (which semester of their college career they took it) columns
    - organize rows by student
    - develop a way to categorize students by course set and/or score set (clustering: k-means, random forests, etc.)

- Goals
    - display and summarize clusters of students on a dashboard
    - ongoing categorization (implying course recommendations and interventions) based on it


## Hypothesis Testing

-
"""

import codecs, os, warnings, itertools
import json, csv, requests, sys, re
import numpy as np
import pandas as pd
from multiprocessing import Semaphore
from statistics import mean, median, stdev
from collections import defaultdict

from pipelines import fetch, url
from courses import getCoursesInTerm, course_enrollment
from localcache import get_course_enrollments, query_multiple

# Cache file locations
all_grades_file = "cache/grades_all.csv"
all_courses_file = "cache/course_grades_all.csv"
all_courses_file2 = "cache/course_grades_compact.csv"
all_courses_file3 = "cache/course_grades_full.csv"
all_courses_file4 = "cache/course_grades_full_bystudent.csv"
all_courses_file5 = "cache/courses_passed_bystudent.csv"
student_courses_scores = "cache/courses_student_scores.csv"
student_orientation_participation = 'cache/participation_orientation_courses.json'

def num(s):
    """Parse a CSV cell into an int or float; empty strings count as 0."""
    if s == '': return 0
    s = re.sub(r'\.0$', '', s)   # strip a trailing ".0" only, so whole numbers parse as ints
    try:
        return int(s)
    except ValueError:
        return float(s)

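# Examples: num('') -> 0, num('85.0') -> 85, num('85.5') -> 85.5
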
def sem_num_to_code(sem_num):
    """Convert an SIS term number like '202370' into a short code like 'fa23'."""
    p = re.search(r'^(\d\d\d\d)(\d\d)$', sem_num)
    if p:
        yr = p.group(1)[2:4]
        sem = p.group(2)
        lookup = {'10': 'wi', '30': 'sp', '50': 'su', '70': 'fa'}
        return f"{lookup[sem]}{yr}"
    return ""


def sem_code_to_num(sem_code):
    """Convert a short code like 'fa23' into an SIS term number like '202370'."""
    p = re.search(r'^([a-z]{2})(\d\d)$', sem_code)
    if p:
        s = p.group(1)
        y = p.group(2)
        lookup = {'wi': '10', 'sp': '30', 'su': '50', 'fa': '70'}
        return f"20{y}{lookup[s]}"
    return ""

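# Examples: sem_num_to_code('202370') -> 'fa23', sem_code_to_num('sp21') -> '202130'
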
def codetest():
    """Spot-check the two semester-code converters on known values."""
    sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')
    codes = 'fa21 wi22 sp23 su23 fa23 wi24'.split(' ')
    for s in sems:
        print("{}: {}".format(s, sem_num_to_code(s)))

    for c in codes:
        print("{}: {}".format(c, sem_code_to_num(c)))

def get_all():
    """Script 1: download final/current grades for every active course in each term."""
    # Canvas term ids, paired positionally with the matching SIS semester numbers
    terms = '178 177 176 175 174 173 172 171 168 65 64 62 63 61 60 25 26 23 22 21'.split(' ')
    sems = '202330 202310 202270 202250 202230 202210 202170 202150 202130 202070 202050 202030 202010 201970 201950 201930 201910 201870 201850 201830'.split(' ')

    # Save grades to a CSV file
    with open(all_grades_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["crn", "sem", "coursecode", "s_can_id", "g", "name", "current", "final"])
        for (term, sem) in zip(terms, sems):
            print(term, sem, "\n")
            courses = getCoursesInTerm(term, get_fresh=0, show=0, active=1)
            for c in courses:
                print(c['name'])
                c_code = c['course_code']
                grades(writer, sem, c['id'], c_code)
                csvfile.flush()

def grades(writer, sem, COURSE_ID, course_code):
    """Fetch the users of one course and write a grade row for each student enrollment."""
    params = {"include[]": ["enrollments", "current_grading_period_scores"]}
    users = fetch(url + f"/api/v1/courses/{COURSE_ID}/users", 0, params)

    for student in users:
        try:
            id = student["id"]
            name = student["name"]
            g = student["login_id"]
            print("\t", name)
            if student['enrollments'][0]['type'] == 'StudentEnrollment':
                grade = student["enrollments"][0]["grades"]["final_score"]
                current = student["enrollments"][0]["grades"]["current_score"]
                writer.writerow([COURSE_ID, sem, course_code, id, g, name, current, grade])
        except Exception as e:
            print("Exception:", e)

def get_student_orientations():
    """Total up orientation-course page views per student, across all orientation shells."""
    courses = {'iLearn Student Orientation 2022': '9768',               # 8170 students
               'Kickstart Online Orientation - Transfer': '36',         # 6149
               'Kickstart Online Orientation - New to College': '35',   # 5392
               'LIB732 SP18': '3295',                                   # 2193
               'LIB732 FA17': '2037',                                   # 1868
               'LIB732 SP17': '69',                                     # 1645
               'Kickstart Online Orientation - Returning': '37',        # 1463
               'iLearn Student Orientation 2023': '15924',              # 1292
               'LIB732 SU17': '1439'                                    # 1281
               }

    views_bycourse = {}
    all_student_ids = set()

    # get pageviews of each orientation course
    for c, i in courses.items():
        print(c)
        cache_file_name = f'cache/participation_course_{i}.json'
        student_ids = [x[1] for x in get_course_enrollments(i)]
        all_student_ids.update(student_ids)
        if os.path.exists(cache_file_name):
            pv = json.loads(codecs.open(cache_file_name, 'r', 'utf-8').read())
        else:
            pv = get_student_page_views(i, student_ids)
            codecs.open(cache_file_name, 'w', 'utf-8').write(json.dumps(pv, indent=2))
        views_bycourse[i] = pv

    # add up pageviews for each student
    views_bystudent = {}
    for student_id in all_student_ids:
        views_bystudent[student_id] = sum([views_bycourse[i].get(student_id, 0) for i in courses.values()])
    codecs.open(student_orientation_participation, 'w', 'utf-8').write(json.dumps(views_bystudent, indent=2))

def get_student_page_views(course_id, student_ids):
    """Query the Canvas analytics endpoint for each student's page views in one course."""
    page_views = {}
    verbose = 0

    for student_id in student_ids:
        a = f'/api/v1/courses/{course_id}/analytics/users/{student_id}/activity'
        response = fetch(url + a, verbose)
        page_views[student_id] = sum(response.get('page_views', {}).values())

    if verbose: print(page_views)
    return page_views

schedules = {}
orientations = {}


def load_schedules():
    """Lazily load every cached '<sem>_sched_expanded.json' schedule into the schedules dict."""
    global schedules
    if not schedules:
        for f in os.listdir('cache/schedule'):
            m = re.search(r'(\w\w\d\d)_sched_expanded\.json', f)
            if m:
                sem = m.group(1)
                schedules[sem] = json.loads(codecs.open('cache/schedule/' + f, 'r', 'utf-8').read())


def load_orientations():
    """Lazily load the per-student orientation page-view totals."""
    global orientations
    if not orientations:
        orientations = json.loads(codecs.open(student_orientation_participation, 'r', 'utf-8').read())
    return orientations

def to_crn_fallback(name):
|
|
#print(name)
|
|
name = name.lower()
|
|
try:
|
|
m1 = re.search(r'(\d\d\d\d\d)',name)
|
|
if m1:
|
|
crn = m1.group(1)
|
|
else:
|
|
return None,None
|
|
m2 = re.search(r'([wispufa][wispufa]\d\d)',name.lower())
|
|
if m2:
|
|
sem = m2.group(1)
|
|
else:
|
|
return None, None
|
|
#print(name, crn, sem)
|
|
return crn, sem
|
|
except Exception as e:
|
|
#print("Exception: ", e, name)
|
|
return None, None
|
|
|
|
def ilearn_name_to_course_code(iname):
|
|
parts = iname.split(' ')
|
|
code = parts[0]
|
|
return code
|
|
|
|
def short_name_to_crn(name):
|
|
#print(name)
|
|
try:
|
|
parts = name.split(' ')
|
|
code = parts[0]
|
|
sem = parts[1]
|
|
crn = parts[2]
|
|
m_sem = re.search(r'^(\w\w\d\d)$',sem)
|
|
if not m_sem:
|
|
return to_crn_fallback(name)
|
|
m = re.search(r'^(\d\d\d\d\d)$',crn)
|
|
if m:
|
|
return crn,sem
|
|
else:
|
|
crn_parts = crn.split('/')
|
|
m = re.search(r'^(\d\d\d\d\d)$',crn_parts[0])
|
|
if m:
|
|
return crn_parts[0],sem
|
|
#print("non standard course short name: ", code, sem, crn)
|
|
return to_crn_fallback(name)
|
|
except Exception as e:
|
|
#print("Exception: ", e, name)
|
|
return to_crn_fallback(name)
|
|
|
|
|
|
def fixname(n):
|
|
return re.sub(r'\s+',' ', n).strip()
|
|
|
|
|
|
def short_name_to_teacher_type_crn_sem(name):
|
|
load_schedules()
|
|
crn, sem = short_name_to_crn(name)
|
|
|
|
try:
|
|
if sem:
|
|
sem = sem.lower()
|
|
if sem[0:2]=='wi':
|
|
sem = 'sp' + sem[2:]
|
|
for course in schedules[sem]:
|
|
if course['crn'] == crn:
|
|
return fixname(course['teacher']), course['type'], crn, sem
|
|
except Exception as e:
|
|
return None, None, None, None
|
|
|
|
return None, None, None, None
|
|
|
|
pocrs = {}
|
|
|
|
def load_pocrs():
|
|
global pocrs
|
|
if not pocrs:
|
|
with open('cache/pocr_passed.csv') as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
next(csvreader)
|
|
for row in csvreader:
|
|
pocrs[row[0] + " " + row[1]] = row[2]
|
|
return pocrs
|
|
|
|
def lookup_pocr(teacher,course,sem):
|
|
p = load_pocrs()
|
|
pcode = teacher + " " + course
|
|
if pcode in p:
|
|
sem_passed = sem_code_to_num(p[pcode])
|
|
sem_test = sem_code_to_num(sem)
|
|
if sem_passed < sem_test:
|
|
return True
|
|
return False
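

# Usage sketch (hedged): the names below are made up, and the key format must match
# cache/pocr_passed.csv, i.e. "<teacher as it appears in the schedule> <course code>".
# lookup_pocr returns True only when the section's semester is strictly later than the
# semester in which that teacher/course pair passed POCR.
#
#   lookup_pocr("Jane Smith", "CSIS85", "fa23")   # True if "Jane Smith CSIS85" passed before fa23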
|
|
|
|
def nametest():
    """Spot-check the short-name parser against every course name in the grades file."""
    with open(all_courses_file) as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)   # skip the header row once; skipping inside the loop would drop every other row

        for row in csvreader:
            print(row[0], "-", short_name_to_teacher_type_crn_sem(row[0]))

def above_70(li, maximum):
    """Fraction of scores at or above 70% of the maximum score (rounded to 3 places)."""
    cutoff = 0.7 * maximum
    above = list(filter(lambda x: x >= cutoff, li))
    return round(len(above) / len(li), 3)
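
# Example: above_70([90, 75, 50, 65], 100) -> 0.5  (two of the four scores reach the 70-point cutoff)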
|
|
|
|
|
|
|
|
|
|
|
|
# v1, does a row of averages for each course
|
|
def process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code):
|
|
fxns = [mean, median, stdev, min, max, len]
|
|
c_id = block[0][0]
|
|
sem = block[0][1]
|
|
course_code = block[0][2]
|
|
cur_scores = [num(x[6]) for x in block]
|
|
final_scores = [num(x[7]) for x in block]
|
|
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
|
|
if not teacher:
|
|
return
|
|
tch_code = teacher_to_code[teacher]
|
|
crs_code = course_to_code[course_code]
|
|
if len(final_scores) < 2:
|
|
return
|
|
try:
|
|
(cur_mean, cur_median, cur_stdev, cur_min, cur_max, cur_count) = [round(f(cur_scores)) for f in fxns]
|
|
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
|
|
|
|
cur_pct_passed = above_70(cur_scores, cur_max)
|
|
final_pct_passed = above_70(final_scores, final_max)
|
|
|
|
if final_max == 0: return
|
|
|
|
scaled_final_scores = [ x / final_max for x in final_scores]
|
|
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
|
|
|
|
good_code = ilearn_name_to_course_code(course_code)
|
|
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
|
|
|
|
output.writerow( [crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, scl_min, scl_max, final_count] )
|
|
out_c.writerow([crs_code, good_code, pocr, tch_code, mode, final_pct_passed, scl_mean, scl_median, scl_stdev, final_count])
|
|
except Exception as e:
|
|
print("Exception:", e)
|
|
|
|
|
|
|
|
|
|
# v2, one line per student/course
|
|
def process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code):
|
|
fxns = [mean, median, stdev, min, max, len]
|
|
c_id = block[0][0]
|
|
sem = block[0][1]
|
|
course_code = block[0][2]
|
|
cur_scores = [num(x[6]) for x in block]
|
|
final_scores = [num(x[7]) for x in block]
|
|
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(course_code)
|
|
if not teacher:
|
|
return
|
|
tch_code = teacher_to_code[teacher]
|
|
crs_code = course_to_code[course_code]
|
|
if len(final_scores) < 2:
|
|
return
|
|
try:
|
|
|
|
# "course_code course pocr_status orientation_status teacher_code mode student_id scaled_score"
|
|
|
|
(final_mean, final_median, final_stdev, final_min, final_max, final_count) = [round(f(final_scores)) for f in fxns]
|
|
final_pct_passed = above_70(final_scores, final_max)
|
|
|
|
if final_max == 0: return
|
|
|
|
scaled_final_scores = [ x / final_max for x in final_scores]
|
|
(scl_mean, scl_median, scl_stdev, scl_min, scl_max, scl_count) = [round(f(scaled_final_scores),2) for f in fxns]
|
|
|
|
good_code = ilearn_name_to_course_code(course_code)
|
|
pocr = 1 if lookup_pocr(teacher, good_code, sem2) else 0
|
|
|
|
o = load_orientations()
|
|
|
|
for row in block:
|
|
student_id = row[3]
|
|
orientation = o[student_id] if student_id in o else 0
|
|
scaled_score = round(num(row[7]) / final_max, 2)
|
|
out_f.writerow([crs_code, good_code, pocr, orientation, tch_code, mode, student_id, scaled_score])
|
|
print(course_code)
|
|
except Exception as e:
|
|
print("Exception:", e)
|
|
|
|
def process_grades():
|
|
# first loop to get all names
|
|
courses_labeled = {}
|
|
teacher_to_code = {}
|
|
code_to_teacher = {}
|
|
|
|
course_to_code = {}
|
|
code_to_course = {}
|
|
|
|
index = 1001
|
|
crs_index = 4001
|
|
|
|
with open(all_grades_file, newline="") as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
next(csvreader)
|
|
for row in csvreader:
|
|
crn_sem = row[0] + '_' + row[1]
|
|
if not crn_sem in courses_labeled:
|
|
teacher, mode, crn, sem2 = short_name_to_teacher_type_crn_sem(row[2])
|
|
courses_labeled[crn_sem] = teacher
|
|
|
|
if not row[2] in course_to_code:
|
|
course_to_code[row[2]] = crs_index
|
|
code_to_course[crs_index] = row[2]
|
|
crs_index += 1
|
|
|
|
if teacher:
|
|
if not teacher in teacher_to_code:
|
|
teacher_to_code[teacher] = index
|
|
code_to_teacher[index] = teacher
|
|
index += 1
|
|
codecs.open('cache/teacher_lookup_codes.json','w','utf-8').write( json.dumps( [teacher_to_code, code_to_teacher], indent=2) )
|
|
codecs.open('cache/course_lookup_codes.json','w','utf-8').write( json.dumps( [course_to_code, code_to_course], indent=2) )
|
|
|
|
out_fullrows = codecs.open(all_courses_file3,'w','utf-8')
|
|
out_f = csv.writer(out_fullrows)
|
|
out_f.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
|
|
|
|
out_compact = codecs.open(all_courses_file2,'w','utf-8')
|
|
out_c = csv.writer(out_compact)
|
|
out_c.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev count".split(" "))
|
|
with open(all_courses_file, "w", newline="") as output_f:
|
|
output = csv.writer(output_f)
|
|
output.writerow("course_code course pocr_status teacher_code mode percent_passed scl_mean scl_median scl_stdev scl_min scl_max count".split(" "))
|
|
|
|
with open(all_grades_file, newline="") as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
block = []
|
|
current_index = None
|
|
|
|
next(csvreader)
|
|
|
|
for row in csvreader:
|
|
index = row[0]
|
|
|
|
if index != current_index:
|
|
if block:
|
|
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
|
|
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
|
|
block = []
|
|
current_index = index
|
|
|
|
block.append(row)
|
|
|
|
if block:
|
|
process_one_course_grades(block, output, out_c, teacher_to_code, course_to_code)
|
|
process_one_course_grades_full(block, out_f, teacher_to_code, course_to_code)
|
|
|
|
|
|
def reorganize_grades_student():
|
|
with open(all_courses_file3, newline="") as csvfile:
|
|
csvreader = csv.reader(csvfile)
|
|
bystudent = defaultdict(list)
|
|
|
|
next(csvreader)
|
|
|
|
for row in csvreader:
|
|
st = row[6]
|
|
bystudent[st].append(row)
|
|
|
|
students = sorted(bystudent.keys())
|
|
with open(all_courses_file4, "w", newline="") as output_f:
|
|
with open(all_courses_file5, "w", newline="") as output_s:
|
|
with open(student_courses_scores,'w') as output_scs:
|
|
output_s.write("student,courses\n")
|
|
output = csv.writer(output_f)
|
|
output.writerow("course_code course pocr_status orientation_status teacher_code mode student_id scaled_score".split(" "))
|
|
# student id 0 has no courses
|
|
output.writerow([0,])
|
|
for st in students:
|
|
courses = [r[1] for r in bystudent[st]]
|
|
scores = [r[7] for r in bystudent[st]]
|
|
zipped = zip(courses,scores)
|
|
output_scs.write(st + ",")
|
|
for c,s in zipped:
|
|
output_scs.write(f"{c}|{s},")
|
|
output_scs.write("\n")
|
|
output_s.write(st + "," + " ".join(courses) + "\n")
|
|
for row in bystudent[st]:
|
|
output.writerow(row)
|
|
|
|
|
|
def all_course_names_setup():
|
|
cc = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
|
|
courses = {}
|
|
for C in cc.values():
|
|
name = C['dept'] + C['number']
|
|
#print(name)
|
|
courses[ name ] = C
|
|
|
|
#co = codecs.open('cache/courses/names.json','w','utf-8')
|
|
#for c in sorted(courses.keys()):
|
|
# co.write(c + "\n")
|
|
|
|
cr = codecs.open('cache/courses/names.json','r','utf-8')
|
|
|
|
|
|
from_data = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
|
|
unknown = {}
|
|
for line in from_data:
|
|
parts = line.split(',')
|
|
stu_id = parts[0]
|
|
ea = parts[1:]
|
|
for C in ea:
|
|
each = C.split('|')
|
|
name = each[0]
|
|
if not name in courses:
|
|
unknown[name] = name
|
|
#data_courses[each[0]] += 1
|
|
for c in sorted(unknown.keys()):
|
|
print(c)
|
|
|
|
#co.write( json.dumps( {'unknown':unknown, 'coursenames':courses}, indent=2 ))
|
|
|
|
|
|
lookup = {}
|
|
names = {}
|
|
|
|
def shell2course(shell):
|
|
global lookup, names
|
|
if not lookup:
|
|
cr = json.loads(codecs.open('cache/courses/names.json','r','utf-8').read())
|
|
lookup = cr['unknown']
|
|
allcourses = cr['coursenames']
|
|
names = allcourses.keys()
|
|
|
|
if shell in names:
|
|
return shell
|
|
if shell in lookup:
|
|
c = lookup[shell]
|
|
if c in names:
|
|
return c
|
|
#print(f"Can't find course: {shell}")
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def stu_record_line(line):
|
|
line = line.strip()
|
|
line = line.strip(',')
|
|
parts = line.split(',')
|
|
stu_id = parts[0]
|
|
courses = []
|
|
for C in parts[1:]:
|
|
courses.append(C.split('|'))
|
|
return stu_id, courses
|
|
|
|
def stu_record_to_vector(line, boolean=0):
|
|
id, courses = stu_record_line(line)
|
|
|
|
yesval = "true" if boolean else 1
|
|
noval = "false" if boolean else 0
|
|
|
|
template = json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
|
|
lookup = {}
|
|
for i,c in enumerate(template):
|
|
lookup[c] = i
|
|
vector = [noval for x in range(len(template))]
|
|
for C in courses:
|
|
goodname = shell2course(C[0])
|
|
if goodname:
|
|
vector[lookup[goodname]] = yesval # C[1] # score
|
|
return id,vector,courses
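

# Illustration (hedged, hypothetical course names): a line such as
#   "12345,CSIS85|0.92,ENG1A|0.81,"
# yields id '12345' and a vector with yesval at the template positions for CSIS85 and
# ENG1A (looked up via shell2course / cache/courses/course_main_record.json) and noval
# everywhere else; the per-course scores are returned separately in `courses`.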
|
|
|
|
|
|
def grades_to_vectors(boolean=0, verbose=0):
|
|
grades = codecs.open('cache/courses_student_scores.csv','r','utf-8').readlines()
|
|
for L in grades:
|
|
id, vector, courses = stu_record_to_vector(L,boolean)
|
|
if verbose: print(id, vector)
|
|
yield id, vector, courses
|
|
|
|
def course_main_record():
|
|
return json.loads(codecs.open('cache/courses/course_main_record.json','r','utf-8').read())
|
|
|
|
|
|
def courses_to_vector_ordered(course_list):
|
|
# each course is (name, semester_order, score)
|
|
template = course_main_record()
|
|
lookup = {}
|
|
for i,c in enumerate(template):
|
|
lookup[c] = i
|
|
vector = ['0' for x in range(len(template))]
|
|
for course,order,score in course_list:
|
|
goodname = shell2course(course)
|
|
if goodname:
|
|
vector[lookup[goodname]] = str(order)
|
|
return vector
|
|
|
|
def courses_to_vector(course_list, boolean=1):
|
|
#print(course_list)
|
|
yesval = "true" if boolean else 1
|
|
noval = "false" if boolean else 0
|
|
template = course_main_record()
|
|
lookup = {}
|
|
for i,c in enumerate(template):
|
|
lookup[c] = i
|
|
vector = [noval for x in range(len(template))]
|
|
for C in course_list:
|
|
C = C.strip()
|
|
#goodname = shell2course(C[0])
|
|
#if goodname:
|
|
#print(C)
|
|
vector[lookup[C]] = yesval # C[1] # score
|
|
#print(vector)
|
|
return vector
|
|
|
|
def course_vector_to_names(vector):
|
|
template = course_main_record()
|
|
names = []
|
|
for i,v in enumerate(vector):
|
|
if v:
|
|
names.append(template[i])
|
|
return names
|
|
|
|
|
|
def all_course_names():
|
|
ac = json.loads(codecs.open('cache/courses/courses_built.json','r','utf-8').read())
|
|
master_record = []
|
|
for C in ac.values():
|
|
if C['status'] == 'Draft':
|
|
continue
|
|
name = C['dept'] + C['number']
|
|
master_record.append(name)
|
|
master_record = set(master_record)
|
|
master_record = list(master_record)
|
|
master_record = sorted(master_record)
|
|
|
|
## Extract from all 'accomplished courses'...
|
|
if 0:
|
|
complete_list = {}
|
|
missing_names = {}
|
|
with open(student_courses_scores,'r') as input_f:
|
|
for L in input_f:
|
|
stu_id, courses = stu_record_line(L)
|
|
for C in courses:
|
|
real_name = shell2course(C[0])
|
|
if real_name:
|
|
complete_list[real_name] = 1
|
|
else:
|
|
missing_names[C[0]] = 1
|
|
master_record = sorted(complete_list.keys())
|
|
print(f"Found {len(master_record)} courses")
|
|
print(master_record)
|
|
print(f"Missing {len(missing_names)} courses")
|
|
print(missing_names)
|
|
mr = codecs.open('cache/courses/course_main_record.json','w','utf-8')
|
|
mr.write(json.dumps(master_record,indent=2))
|
|
|
|
|
|
from semesters import sems_by_human_name, canvas_label
|
|
from semesters import code as semester_order
|
|
from localcache import all_students_history
|
|
from datetime import datetime, timedelta
|
|
|
|
def semester_dates():
|
|
#print()
|
|
for c in canvas_label:
|
|
print(sems_by_human_name[c])
|
|
|
|
length = 15
|
|
if sems_by_human_name[c]['code'][0:2] == 'su':
|
|
length = 5
|
|
start_date = sems_by_human_name[c]['start']
|
|
# Convert the date string to a datetime object
|
|
date_object = datetime.strptime(start_date, '%m/%d/%y')
|
|
start_fmt = date_object.strftime('%a %b %d, %Y')
|
|
|
|
# Add 15weeks, 5days to the date
|
|
new_date = date_object + timedelta(weeks=15)
|
|
new_date = new_date + timedelta(days=5)
|
|
|
|
# Format the new date as a string
|
|
new_date_string = new_date.strftime('%m/%d/%y')
|
|
end_fmt = new_date.strftime('%a %b %d, %Y')
|
|
|
|
# Print the new date
|
|
print(f"start: {start_fmt}, end: {end_fmt}")
|
|
|
|
|
|
|
|
current_student = ""
|
|
current_student_block = []
|
|
current_student_info = {'first':'', 'last':''}
|
|
normalized_blocks = []
|
|
|
|
ignore_courses = "El,zACCT20,GASPAR".split(",")
|
|
seen_courses = []
|
|
|
|
def course_line_process(line):
|
|
global current_student, current_student_block, seen_courses, normalized_blocks, current_student_info
|
|
|
|
sem = line['term_name']
|
|
m1 = re.search(r'^(\d\d\d\d)\s(\w+)$', sem)
|
|
if not m1: # is NOT an academic semester, skip
|
|
return
|
|
|
|
uid = line['canvasid']
|
|
if uid != current_student:
|
|
if current_student_block:
|
|
current_student_block.append(current_student_info)
|
|
normalized_blocks.append(current_student_block)
|
|
current_student_block = []
|
|
current_student_info = {'first':sems_by_human_name[sem]['code'], 'last':''}
|
|
current_student = uid
|
|
#print(f"Student: {uid} ({line['user_name']})")
|
|
|
|
# line is a dict
|
|
current_student_info['last'] = sems_by_human_name[sem]['code']
|
|
year, season = m1.group(1), m1.group(2)
|
|
date_format = "%Y-%m-%d %H:%M:%S.%f"
|
|
create_dt = datetime.strptime(line['created'], date_format)
|
|
update_dt = datetime.strptime(line['updated'], date_format)
|
|
sem_start = datetime.strptime(sems_by_human_name[sem]['start'], '%m/%d/%y')
|
|
|
|
course = line['course_name']
|
|
c_parts = course.split(' ')
|
|
if c_parts[0] in ignore_courses or c_parts[0] in seen_courses:
|
|
return
|
|
classname = shell2course(c_parts[0])
|
|
if not classname:
|
|
# print empty dict entry for initial setup
|
|
# print(f" \"{c_parts[0]}\": \"\",")
|
|
seen_courses.append(c_parts[0])
|
|
else:
|
|
#
|
|
flow = line['workflow']
|
|
mark = '+'
|
|
if flow == "deleted": mark = '-'
|
|
# normal start & finish, give add date
|
|
add_day = sem_start - create_dt
|
|
add_day = add_day.days
|
|
sign = '-'
|
|
if add_day < 0:
|
|
add_day = -add_day
|
|
sign = '+'
|
|
#print(f" {mark} {classname} added T{sign}{add_day} {semester_list[sem]['code']}")
|
|
temp_usr_name = re.sub(r',','',line['user_name'])
|
|
current_student_block.append(f"{uid},{temp_usr_name},{classname},add,T{sign}{add_day},{sems_by_human_name[sem]['code']}")
|
|
if flow == "deleted":
|
|
# deleted, give delete date
|
|
del_day = sem_start - update_dt
|
|
del_day = del_day.days
|
|
sign = '-'
|
|
if del_day < 0:
|
|
del_day = -del_day
|
|
sign = '+'
|
|
#print(f" {mark} {classname} deleted T{sign}{del_day} {semester_list[sem]['code']}")
|
|
current_student_block.append(f"{uid},{temp_usr_name},{classname},del,T{sign}{del_day},{sems_by_human_name[sem]['code']}")
|
|
|
|
|
|
def normalize_course_histories():
|
|
global normalized_blocks, current_student_block, current_student_info
|
|
all_students_history(course_line_process, limit=99910000)
|
|
current_student_block.append(current_student_info)
|
|
normalized_blocks.append(current_student_block)
|
|
|
|
codecs.open('cache/normalized_student_add_drop.json','w','utf-8').write(json.dumps(normalized_blocks,indent=2))
|
|
|
|
# let's see if we can get grades...
|
|
grades_by_student_course = defaultdict(dict)
|
|
print("Doing grades...")
|
|
with codecs.open('cache/courses_student_scores.csv','r','utf-8') as gradesfile:
|
|
for s in gradesfile:
|
|
parts = s.split(',')
|
|
stu = int(parts[0])
|
|
#print(stu)
|
|
for c in parts[1:]:
|
|
try:
|
|
#print(c)
|
|
crs,gra = c.split('|')
|
|
grades_by_student_course[stu][crs] = gra
|
|
except Exception as e:
|
|
pass
|
|
|
|
# go through again
|
|
print("Second pass of grades and student history...")
|
|
student_history = codecs.open('cache/normalized_student_history.csv','w','utf-8')
|
|
student_history.write("studentid,studentname,course,action,when,grade,sem_name,first_sem,last_sem,tenure_length,sem_index\n")
|
|
semester_order.reverse()
|
|
for blk in normalized_blocks:
|
|
info = blk[-1]
|
|
first = semester_order.index(info['first']) + 1
|
|
last = semester_order.index(info['last']) + 1
|
|
length = last - first + 1
|
|
|
|
for course in blk[:-1]:
|
|
parts = course.split(',')
|
|
#print(parts)
|
|
sem = parts[5]
|
|
sem_index = semester_order.index(sem) - first + 2
|
|
stu = int(parts[0])
|
|
crs = parts[2]
|
|
grade = ""
|
|
if stu in grades_by_student_course:
|
|
if crs in grades_by_student_course[stu]:
|
|
grade = grades_by_student_course[stu][crs]
|
|
|
|
student_history.write(",".join([parts[0], parts[1], parts[2], parts[3], parts[4], grade, parts[5], str(first), str(last), str(length), str(sem_index), ]) + '\n')
|
|
|
|
# make "unified records" or one line per student
|
|
student_history_2 = codecs.open('cache/normalized_student_history2.csv','w','utf-8')
|
|
allcourse = course_main_record()
|
|
#print(allcourse)
|
|
template = ['studentid', 'studentname', 'tenure_length']
|
|
template.extend(allcourse)
|
|
#print(template)
|
|
student_history_2.write( ",".join(template) + "\n" )
|
|
|
|
for blk in normalized_blocks:
|
|
student_block = []
|
|
info = blk[-1]
|
|
first = semester_order.index(info['first']) + 1
|
|
last = semester_order.index(info['last']) + 1
|
|
length = last - first + 1
|
|
|
|
temp_course_holder = {}
|
|
temp_course_grade_holder = {}
|
|
|
|
for course in blk[:-1]:
|
|
parts = course.split(',')
|
|
#print(parts)
|
|
sem = parts[5]
|
|
sem_index = semester_order.index(sem) - first + 2
|
|
stu = int(parts[0])
|
|
crs = parts[2]
|
|
if parts[3] == 'add':
|
|
temp_course_holder[crs] = sem_index
|
|
elif parts[3] == 'del' and crs in temp_course_holder:
|
|
del temp_course_holder[crs]
|
|
|
|
# now the temp_course_holder has the courses and semesters
|
|
for crs,sem_index in temp_course_holder.items():
|
|
grade = ""
|
|
if stu in grades_by_student_course:
|
|
if crs in grades_by_student_course[stu]:
|
|
grade = grades_by_student_course[stu][crs]
|
|
this_record = (crs, sem_index, grade)
|
|
student_block.append(this_record)
|
|
student_vector = [ parts[0], parts[1], str(length) ]
|
|
student_vector.extend(courses_to_vector_ordered(student_block))
|
|
|
|
student_history_2.write(",".join(student_vector) + '\n')
|
|
#print(student_vector)
|
|
|
|
def cluster_student_histories():
|
|
infile = 'cache/courses_student_scores.csv'
|
|
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from kneed import KneeLocator
|
|
from sklearn.datasets import make_blobs
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
df = pd.read_csv(infile)
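

# Hedged sketch (not part of the original pipeline): one way cluster_student_histories
# could be finished, following the docstring's "clustering: kmeans" idea. It assumes
# grades_to_vectors(boolean=0) yields (id, vector, courses) rows as defined above; the
# cluster count k=8 and the output file name are arbitrary illustrations, not tuned values.
def cluster_student_vectors_sketch(k=8):
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    ids, vectors = [], []
    for sid, vector, _courses in grades_to_vectors(boolean=0):
        ids.append(sid)
        vectors.append(vector)

    # standardize the 0/1 course-membership vectors, then cluster
    scaled = StandardScaler().fit_transform(vectors)
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(scaled)

    # one row per student: id, cluster label
    with open('cache/student_clusters_sketch.csv', 'w') as out:
        out.write("student_id,cluster\n")
        for sid, label in zip(ids, labels):
            out.write(f"{sid},{label}\n")
    return labels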
|
|
|
|
def dept(s):
|
|
parts = s.split(' ')
|
|
return parts[0]
|
|
|
|
def try_make_sched():
|
|
term = "fa23"
|
|
sched = requests.get(f"http://gavilan.cc/schedule/{term}_sched.json").json()
|
|
#print(json.dumps(sched,indent=2))
|
|
|
|
d = "CSIS"
|
|
courses = [ [x['code'], x['crn']] for x in sched if dept(x['code'])==d ]
|
|
teachers = { x['teacher'] for x in sched if dept(x['code'])==d }
|
|
|
|
print(courses)
|
|
print(teachers)
|
|
|
|
|
|
|
|
def sched_lookup_tables():
|
|
|
|
# Renumber the semesters
|
|
# sp16 su16 fa16 wi17 sp17 su17 fa17 wi18
|
|
#semesters = "sp18 su18 fa18 wi19 sp19 su19 fa19 wi20 sp20 su20 fa20 wi21 sp21 su21 fa21 wi22 sp22 su22 fa22 wi23 sp23 su23 fa23 wi24 sp24 su24 fa24 wi25 sp25 su25 fa25 wi26".split(" ")
|
|
|
|
sem_fourcode = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24 sp25 su25 fa25".split(" ")
|
|
int_numbers = [x for x in range(1,len(sem_fourcode)+1)]
|
|
fourcode_2_int = {semester: number for semester, number in zip(sem_fourcode, int_numbers)}
|
|
int_2_fourcode = {v: k for k, v in fourcode_2_int.items()}
|
|
|
|
sis_2_fourcode = {}
|
|
fourcode_2_sis = {}
|
|
yr = 2018
|
|
sems = ['30','50','70']
|
|
i = 0
|
|
semcodes = []
|
|
while yr < 2026:
|
|
for s in sems:
|
|
semcodes.append(f"{yr}{s}")
|
|
sis_2_fourcode[f"{yr}{s}"] = sem_fourcode[i]
|
|
fourcode_2_sis[sis_2_fourcode[f"{yr}{s}"]] = f"{yr}{s}"
|
|
#print(f"UPDATE schedule SET semsis={yr}{s} WHERE sem='{semesters[i]}';")
|
|
i += 1
|
|
yr += 1
|
|
return fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes
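
# Usage sketch for the lookup tables above (values shown are what the code computes,
# e.g. 'sp18' is semester 1 and maps to SIS term '201830'):
#   fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
#   fourcode_2_int['sp18']    # 1
#   int_2_fourcode[1]         # 'sp18'
#   fourcode_2_sis['sp18']    # '201830'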
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def section_stats_bymode():
|
|
data = query_multiple("SELECT code, semsis, COUNT(id) AS sections, sum(act) filter (WHERE type='in-person') AS inperson, sum(act) filter (WHERE type='online') AS online, sum(act) filter (WHERE type='hybrid') AS hybrid, sum(act) filter (WHERE type='online live') AS onlinelive FROM schedule GROUP BY code, semsis ORDER BY code, semsis;", 'cache/canvas_data/data20231012.db')
|
|
import pandas as pd
|
|
df = pd.DataFrame(data)
|
|
df.fillna(0,inplace=True)
|
|
for L in 'sections,inperson,online,hybrid,onlinelive'.split(','):
|
|
df[L] = df[L].astype(int)
|
|
print(df)
|
|
df.to_csv('cache/section_stats_bymode.csv')
|
|
return df
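
# The resulting frame (also written to cache/section_stats_bymode.csv) has one row per
# (code, semsis) pair with integer columns: sections, inperson, online, hybrid, onlinelive.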
|
|
|
|
def section_stats():
|
|
# for each course, (ENG1A) how many are enrolled in each all sections?
|
|
# (and break down by mode,time,location,etc)
|
|
#
|
|
# for each course, how many are first semester gav students?
|
|
#
|
|
data = query_multiple("SELECT * FROM schedule ORDER BY code,id", 'cache/canvas_data/data20231012.db')
|
|
|
|
fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
|
|
|
|
# Assuming your data is in a list of dictionaries called data
|
|
df = pd.DataFrame(data)
|
|
|
|
# Drop the specified columns
|
|
df = df.drop(columns=['id', 'crn', 'units', 'teacher', 'start', 'end', 'loc', 'cap'])
|
|
|
|
codecs.open('cache/sem_mapping.json','w','utf-8').write(json.dumps(fourcode_2_int,indent=2))
|
|
|
|
df['sem'] = df['sem'].map(fourcode_2_int)
|
|
df.set_index('sem', inplace=True)
|
|
return df
|
|
|
|
def simple_exp_smoothing_section_model():
|
|
sout = codecs.open('cache/section_predictions.txt','w','utf-8')
|
|
from statsmodels.tsa.api import SimpleExpSmoothing
|
|
warnings.filterwarnings("ignore")
|
|
periods = 3
|
|
start = 19
|
|
|
|
df = section_stats()
|
|
print(df)
|
|
df = df.sort_index()
|
|
|
|
predictions = {}
|
|
for course_code in df['code'].unique():
|
|
try:
|
|
print(course_code)
|
|
sout.write(course_code + "\n")
|
|
this_set = df[df['code'] == course_code]['act']
|
|
this_set = this_set.groupby('sem').sum()
|
|
#this_set.fillna(method='ffill', inplace=True)
|
|
#this_set.fillna(0, inplace=True)
|
|
|
|
# Create a new index with all required semesters
|
|
new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
|
|
|
|
# Reindex the DataFrame and fill missing values with 0
|
|
this_set = this_set.reindex(new_index, fill_value=0)
|
|
|
|
print(this_set.to_string())
|
|
|
|
sout.write(this_set.to_string() + "\n")
|
|
model = SimpleExpSmoothing(this_set)
|
|
fit = model.fit(smoothing_level=0.2) # initiate with a smoothing level of 0.2
|
|
# Later modify above line based on if your data has high or low variability
|
|
|
|
#prediction = fit.forecast(start=32,end=34) # predict attendance for the next 3 semesters
|
|
prediction = fit.predict(start=start,end=start+4)
|
|
print(prediction)
|
|
sout.write(str(prediction) + "\n")
|
|
sout.flush()
|
|
predictions[course_code] = prediction
|
|
except Exception as e:
|
|
print(f"Model creation failed for {course_code} due to {str(e)}")
|
|
sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
|
|
"""
|
|
model = ARIMA(this_set, order=(1,1,1)) #ARIMA params (p, d, q)
|
|
model_fit = model.fit()
|
|
forecast_result = model_fit.forecast(steps=periods)
|
|
if forecast_result:
|
|
predictions[course_code] = forecast_result[0]
|
|
else:
|
|
print(f"No prediction for {course_code}. Skipping...")"""
|
|
|
|
|
|
# statistics - use a smooth exponential model to predict the next 3 semesters of enrollment
|
|
# Doesn't really seem to get the patterns.
|
|
def exp_smoothing_section_model():
|
|
sout = codecs.open('cache/section_predictions.txt','w','utf-8')
|
|
from statsmodels.tsa.api import ExponentialSmoothing
|
|
warnings.filterwarnings("ignore")
|
|
periods = 3
|
|
start = 19
|
|
|
|
fourcode_2_int, int_2_fourcode, sis_2_fourcode, fourcode_2_sis, semcodes = sched_lookup_tables()
|
|
|
|
df = section_stats()
|
|
print(df)
|
|
df = df.sort_index()
|
|
|
|
predictions = {}
|
|
for course_code in df['code'].unique():
|
|
try:
|
|
print(course_code)
|
|
#sout.write(course_code + "\n")
|
|
this_set = df[df['code'] == course_code]['act']
|
|
this_set = this_set.groupby('sem').sum()
|
|
#this_set.fillna(method='ffill', inplace=True)
|
|
#this_set.fillna(0, inplace=True)
|
|
|
|
# Create a new index with all required semesters
|
|
new_index = np.arange(this_set.index.min(), this_set.index.max()+1)
|
|
|
|
# Reindex the DataFrame and fill missing values with 0
|
|
this_set = this_set.reindex(new_index, fill_value=0)
|
|
|
|
print(this_set.to_string())
|
|
|
|
for i,v in this_set.items():
|
|
sout.write(f"{course_code},{int_2_fourcode[i]},{v}\n")
|
|
|
|
model = ExponentialSmoothing(this_set, seasonal_periods=4, trend='add', seasonal='add')
|
|
fit = model.fit()
|
|
|
|
prediction = fit.predict(start=start,end=start+4)
|
|
print(prediction)
|
|
for i,v in prediction.items():
|
|
v = int(v)
|
|
if v<0: v=0
|
|
sout.write(f"{course_code},{int_2_fourcode[i]}, {v}\n")
|
|
sout.flush()
|
|
predictions[course_code] = prediction
|
|
except Exception as e:
|
|
print(f"Model creation failed for {course_code} due to {str(e)}")
|
|
#sout.write(f"Model creation failed for {course_code} due to {str(e)}\n")
|
|
|
|
def student_by_semester():
|
|
|
|
query = """
|
|
SELECT u.name, u.canvasid, s.code, s.semsis FROM users u
|
|
JOIN enrollment e ON u.id = e.user_id
|
|
JOIN courses c ON c.id = e.course_id
|
|
JOIN terms t ON c.termid = t.id
|
|
JOIN schedule s ON c.schedule = s.id
|
|
WHERE e.type='StudentEnrollment' AND e.workflow='active'
|
|
ORDER BY u.sortablename, s.semsis;
|
|
"""
|
|
|
|
df = pd.DataFrame(query_multiple(query, 'cache/canvas_data/data20231012.db'))
|
|
|
|
# Apply groupby and aggregate the courses in each semester in a comma-separated string
|
|
df['courses'] = df.groupby(['name','canvasid','semsis'])['code'].transform(lambda x : ' / '.join(x))
|
|
|
|
# Removing duplicates
|
|
df = df[['name','canvasid','semsis','courses']].drop_duplicates()
|
|
|
|
# Create pivot table
|
|
df_pivot = df.pivot_table(values='courses', index=['name','canvasid'], columns='semsis', aggfunc='first').reset_index()
|
|
|
|
# Adding prefix to new columns names to recognize them
|
|
df_pivot.columns = [str(col) + '_sem' if isinstance(col, int) else col for col in df_pivot.columns]
|
|
|
|
df_pivot.to_csv('cache/student_by_semester.csv')
|
|
|
|
|
|
def sections_grouped_by_year_mode():
|
|
df = section_stats_bymode()
|
|
|
|
# list of unique courses
|
|
df_all_courses = df['code'].unique()
|
|
|
|
# list of unique semesters
|
|
df_all_semesters = df['semsis'].unique()
|
|
df_all_semesters.sort()
|
|
|
|
|
|
|
|
raw_data = {}
|
|
    for _, line in df.iterrows():   # iterate rows; iterating the DataFrame directly only yields column names
|
|
print(line['semsis'])
|
|
sis = str(line['semsis'])
|
|
year = sis[0:4]
|
|
raw_data[ f"{line['code']}{year}"] = [line['inperson'],line['online'],line['hybrid'],line['onlinelive']]
|
|
    print(raw_data)
    return   # work in progress: the per-course / per-year grouping sketched below is not wired up yet
|
|
|
|
for course in df_all_courses:
|
|
c = str(course)
|
|
template = {'code':[c,c,c], 'semsis':[], 'inperson':[], 'online':[], 'hybrid':[], 'onlinelive':[]}
|
|
|
|
# group semesters in to groups of 3 by year
|
|
for i in df_all_semesters:
|
|
j = str(i)
|
|
year = j[0:4]
|
|
print(f"{i} ({year})")
|
|
|
|
# for each course, for each group of 3 semesters, fill in values, using 0 if necessary
|
|
|
|
# ...
|
|
|
|
def lstm_model_sections():
|
|
from keras.models import Sequential
|
|
from keras.layers import Dense
|
|
from keras.layers import LSTM
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Preprocessing
|
|
|
|
# Normalize inputs for better performance
|
|
df = section_stats_bymode()
|
|
print(df)
|
|
scaler = MinMaxScaler(feature_range=(0, 1))
|
|
dataset_scaled = scaler.fit_transform(df.drop(['code', 'semsis'], axis=1))
|
|
print("scaled:")
|
|
print(df)
|
|
|
|
# Split features and targets (Assuming you want to predict 'online' enrollments)
|
|
X = dataset_scaled[:, 1:]
|
|
Y = dataset_scaled[:,0:1]
|
|
|
|
# Train / Test split
|
|
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
|
|
|
|
# Reshape input to be [samples, time steps, features] which is required for LSTM
|
|
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
|
|
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
|
|
|
|
print("x_train shape:", x_train.shape)
|
|
print(x_train)
|
|
|
|
print("\n\nTraining...\n\n")
|
|
|
|
# LSTM architecture
|
|
model = Sequential()
|
|
model.add(LSTM(50, input_shape=(X.shape[1], 1))) # 50 LSTM blocks
|
|
model.add(Dense(1)) # Since we are predicting only 1 output ('online' enrollments)
|
|
model.compile(loss='mean_squared_error', optimizer='adam')
|
|
model.fit(x_train, y_train, epochs=5, batch_size=1) # Training the model
|
|
|
|
# Prediction
|
|
scaler_predict = MinMaxScaler()
|
|
scaler_predict.fit_transform(df[['online']])
|
|
trainPredict = model.predict(x_train)
|
|
testPredict = model.predict(x_test)
|
|
|
|
# Invert predictions (Due to normalization)
|
|
trainPredict = scaler_predict.inverse_transform(trainPredict)
|
|
testPredict = scaler_predict.inverse_transform(testPredict)
|
|
|
|
# Now you have your future prediction in testPredict.
|
|
|
|
print("Predictions:")
|
|
print(testPredict)
|
|
np.savetxt('cache/section_predictions_lstm.txt',testPredict, fmt='%f')
|
|
|
|
    # Exploratory stopping point: the inverse-transformed predictions still need to be
    # joined back onto the original rows (course code / semester) before they are usable.
|
|
|
|
|
|
|
|
|
|
|
|
def visualize_course_modes_multi_semester():
|
|
import plotly.express as px
|
|
from plotly.subplots import make_subplots
|
|
seasons = {'sp':'30','su':'50','fa':'70'}
|
|
semcodes = "sp18 su18 fa18 sp19 su19 fa19 sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24 su24 fa24".split(" ")
|
|
sems = { x:'20' + x[2:] + seasons[x[:2]] for x in semcodes }
|
|
sem_dfs = []
|
|
sem_dfs_depts = []
|
|
for s in sems.keys():
|
|
print(f"fetching {s}")
|
|
sched = requests.get(f"http://gavilan.cc/schedule/{s}_sched_expanded.json").json()
|
|
for crs in sched:
|
|
if 'extra' in crs: del crs['extra']
|
|
crs['dept'] = crs['code'].split(' ')[0]
|
|
df = pd.DataFrame(sched)
|
|
df_depts = df.copy()
|
|
df_depts = df_depts.drop(columns=['crn','sec','code','cmp','name','days','time','rem','wl_cap','wl_act','wl_rem','teacher','date','loc','ztc','time_start','time_end','start','end','doy'])
|
|
df = df.drop(columns=['crn','sec','code','cmp','name','days','time','rem','wl_cap','wl_act','wl_rem','teacher','date','loc','ztc','time_start','time_end','start','end','doy'])
|
|
grouped_by_dept = df_depts.groupby(['dept','type']).size().reset_index(name='count')
|
|
grouped_by_mode = df['type'].value_counts().reset_index()
|
|
grouped_by_dept["semester"] = sems[s]
|
|
grouped_by_mode["semester"] = sems[s]
|
|
sem_dfs.append(grouped_by_mode)
|
|
sem_dfs_depts.append(grouped_by_dept)
|
|
|
|
#grouped_json = grouped_by_dept.to_json(orient='records')
|
|
#j = json.loads(grouped_json)
|
|
#print(json.dumps(j,indent=2))
|
|
|
|
#grouped_by_dept.columns = ['Department', 'Count'] # rename the column names appropriately
|
|
#fig = px.bar(grouped_by_dept, x='Department', y='Count', title='Section Counts by Department')
|
|
#fig.write_html(f"cache/output_{s}.html")
|
|
|
|
|
|
combined_data = pd.concat(sem_dfs, axis=0)
|
|
#print(combined_data)
|
|
#combined_data = combined_data.rename(columns={'type':'count','index':'type'})
|
|
#print(combined_data)
|
|
combined_data.reset_index(drop=True,inplace=True)
|
|
#print(combined_data)
|
|
pivoted_data = combined_data.pivot(index='semester', columns='type', values='count')
|
|
pivoted_data.reset_index(inplace=True)
|
|
|
|
fig = px.bar(pivoted_data, x='semester',y=['hybrid', 'in-person', 'online', 'online live'], barmode='stack',
|
|
title='Course Delivery by Semester',
|
|
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
|
|
fig.update_layout(height=200*len(fig['data']))
|
|
fig.write_html(f"cache/sections_by_deliverymode.html")
|
|
print(f"wrote to: cache/sections_by_deliverymode.html")
|
|
|
|
combined_data_depts = pd.concat(sem_dfs_depts, axis=0)
|
|
combined_data_depts.reset_index(drop=True,inplace=True)
|
|
#print(combined_data_depts)
|
|
combined_data_depts.to_csv('cache/section_delivery_by_dept.csv')
|
|
'''pivoted_data_depts = combined_data_depts.pivot(index='semester', columns='type', values='count')
|
|
pivoted_data_depts.reset_index(inplace=True)
|
|
|
|
fig = px.bar(pivoted_data_depts, x='semester',y=['hybrid', 'in-person', 'online', 'online live'], barmode='stack',
|
|
title='Course Delivery by Semester',
|
|
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
|
|
fig.write_html(f"cache/sections_depts_by_deliverymode.html")'''
|
|
|
|
unique_depts = combined_data_depts['dept'].unique()
|
|
fig = make_subplots(rows=len(unique_depts), cols=1,
|
|
subplot_titles=unique_depts,
|
|
)
|
|
|
|
print("\n\nindividual departments\n\n")
|
|
for i, dept in enumerate(unique_depts, start=1):
|
|
#if i>1: break
|
|
# Filter the dataframe for the current department
|
|
print(f"{dept}")
|
|
dept_data = combined_data_depts[combined_data_depts['dept'] == dept]
|
|
|
|
# Pivot the data frame
|
|
pivoted_dept_data = dept_data.pivot(index='semester', columns='type', values='count').reset_index()
|
|
pivoted_dept_data.fillna(0,inplace=True)
|
|
#print(pivoted_dept_data)
|
|
|
|
# Plot the data
|
|
columns_to_plot = ['hybrid', 'in-person', 'online', 'online live']
|
|
valid_columns = [col for col in columns_to_plot if col in pivoted_dept_data.columns]
|
|
|
|
# to avoid futurewarning
|
|
# print(f" {valid_columns}")
|
|
# if len(valid_columns)==1: valid_columns = valid_columns[0]
|
|
# print(f" {valid_columns}")
|
|
|
|
fig_sub = px.bar(pivoted_dept_data, x='semester', y=valid_columns, barmode='stack',
|
|
#title=f'Course Delivery by Semester for {dept}',
|
|
color_discrete_sequence=["#000066","#660000","#333366","#9400D3"])
|
|
fig.add_traces(fig_sub['data'], rows=[i]*len(fig_sub['data']), cols=[1]*len(fig_sub['data']))
|
|
fig.update_layout(height=70*len(fig['data']), width=1100, showlegend=False)
|
|
fig.write_html(f"cache/sections_depts_by_deliverymode.html")
|
|
|
|
|
|
|
|
# given a list of classes, report back about the student on one row of info
|
|
def student_history_analysis(sh):
|
|
from functools import reduce
|
|
semesters_set = set()
|
|
num_sems = 0
|
|
num_course = len(sh)
|
|
num_units = 0
|
|
units_online = 0
|
|
units_inperson = 0
|
|
units_hybrid = 0
|
|
units_ol = 0
|
|
fa_23_units = 0
|
|
fa_23_online_units = 0
|
|
fa23_courses = 0
|
|
fa23_onlinecourses = 0
|
|
|
|
#un_list = [ float(x['units'].split('-')[0].split('/')[0]) for x in sh ]
|
|
#num_units = reduce(lambda x,y: x+y, un_list)
|
|
for section in sh:
|
|
semesters_set.add(section['sis'])
|
|
units = float(section['units'].split('-')[0].split('/')[0])
|
|
num_units += units
|
|
if section['type'] == 'in-person': units_inperson += units
|
|
if section['type'] == 'online': units_online += units
|
|
if section['type'] == 'hybrid': units_hybrid += units
|
|
if section['type'] == 'online live': units_ol += units
|
|
|
|
if section['sis'] == '202370':
|
|
fa_23_units += units
|
|
fa23_courses += 1
|
|
if not section['type'] == 'in-person':
|
|
fa_23_online_units += units
|
|
fa23_onlinecourses += 1
|
|
|
|
num_sems = len(semesters_set)
|
|
if num_units == 0:
|
|
pct_online = 0
|
|
else:
|
|
pct_online = round(100 * (units_online+units_hybrid+units_ol) / num_units, 1)
|
|
|
|
if fa_23_units == 0:
|
|
fa_23_pct_online = 0
|
|
else:
|
|
fa_23_pct_online = round(100 * (fa_23_online_units) / fa_23_units, 1)
|
|
|
|
if fa23_courses == 0:
|
|
fa23_pct_course_online = 0
|
|
else:
|
|
fa23_pct_course_online = round(100 * (fa23_onlinecourses) / fa23_courses, 1)
|
|
    # total units leads the summary (used by the commented-out sort in report_student_stats)
    summary = [num_units, num_course, f"\"{sh[0]['sortablename']}\",{sh[0]['canvasid']},{num_sems},{num_course},{num_units},{units_online},{units_inperson},{units_hybrid},{units_ol},{pct_online},{fa_23_units},{fa_23_online_units},{fa_23_pct_online},{fa23_courses},{fa23_onlinecourses},{fa23_pct_course_online}"]
|
|
return summary
|
|
|
|
def report_student_stats():
|
|
from localcache import users_with_history, students_current_semester
|
|
from itertools import groupby
|
|
import plotly.graph_objects as go
|
|
import plotly.io as pio
|
|
import numpy as np
|
|
|
|
u = users_with_history()
|
|
this_sem = [x['canvasid'] for x in students_current_semester()]
|
|
|
|
df = pd.DataFrame(u)
|
|
filtered_df = df[df['canvasid'].isin(this_sem)]
|
|
filtered_df.to_csv('cache/student_history_current_students.csv',index=False)
|
|
|
|
oo = codecs.open('cache/student_units.txt','w','utf-8')
|
|
oo.write("name,id,num_sems,num_course,num_units,units_online,units_inperson,units_hybrid,units_ol,percent_online,fa23_units,fa23_onlineunits,fa23_pct_online,fa23_num_courses,fa23_num_onlinecourses,fa23_percent_online_course\n")
|
|
# Now group by that key
|
|
def kk(x): return x['canvasid']
|
|
grouped_dict = {key:list(group) for key, group in groupby(u, kk)}
|
|
|
|
shorter = []
|
|
percentages = []
|
|
|
|
for k,g in grouped_dict.items():
|
|
if k in this_sem:
|
|
h = student_history_analysis(g)
|
|
#oo.write(json.dumps(h[2],indent=2)+ "\n")
|
|
oo.write( str(h[2]) + "\n")
|
|
shorter.append(h)
|
|
p = h[2].split(',')[-1]
|
|
percentages.append(float(p))
|
|
else:
|
|
print(f"Skipping {k}")
|
|
#print(this_sem)
|
|
#oo.write('units,courses\n')
|
|
#shorter.sort(key=lambda x: x[0], reverse=True)
|
|
#for s in shorter:
|
|
# print(s[2])
|
|
# #oo.write(f"{s[0]},{s[1]}\n")
|
|
# #print('\n\n')
|
|
|
|
# Create a histogram
|
|
fig = go.Figure(data=[go.Histogram(x=percentages, xbins=dict(start=0,end=101, size=10))])
|
|
|
|
# Save the figure in an HTML file
|
|
pio.write_html(fig, 'cache/student_pct_onlinecourse.html')
|
|
|
|
|
|
def test_rpy():
|
|
pass
|
|
|
|
'''
|
|
def test_rpy():
|
|
from rpy2 import robjects
|
|
from rpy2.robjects import Formula, Environment
|
|
from rpy2.robjects.vectors import IntVector, FloatVector
|
|
from rpy2.robjects.lib import grid
|
|
from rpy2.robjects.packages import importr, data
|
|
from rpy2.rinterface import RRuntimeWarning
|
|
import warnings
|
|
|
|
# The R 'print' function
|
|
rprint = robjects.globalenv.get("print")
|
|
stats = importr('stats')
|
|
grdevices = importr('grDevices')
|
|
base = importr('base')
|
|
datasets = importr('datasets')
|
|
|
|
grid.activate()
|
|
import math, datetime
|
|
import rpy2.robjects.lib.ggplot2 as ggplot2
|
|
import rpy2.robjects as ro
|
|
from rpy2.robjects.packages import importr
|
|
base = importr('base')
|
|
|
|
mtcars = data(datasets).fetch('mtcars')['mtcars']
|
|
|
|
pp = ggplot2.ggplot(mtcars) + \
|
|
ggplot2.aes_string(x='wt', y='mpg', col='factor(cyl)') + \
|
|
ggplot2.geom_point() + \
|
|
ggplot2.geom_smooth(ggplot2.aes_string(group = 'cyl'),
|
|
method = 'lm')
|
|
pp.plot()
|
|
|
|
|
|
|
|
def test_rpy2():
|
|
import rpy2
|
|
print(rpy2.__version__)
|
|
import rpy2.robjects as robjects
|
|
from rpy2.robjects.packages import importr
|
|
# import R's "base" package
|
|
base = importr('base')
|
|
|
|
# import R's "utils" package
|
|
utils = importr('utils')
|
|
pi = robjects.r['pi']
|
|
print(f"pi={pi[0]}")
|
|
'''
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.tree import DecisionTreeClassifier, export_graphviz
|
|
import graphviz
|
|
|
|
from joblib import dump, load
|
|
|
|
|
|
def cluster_by_mode_1():
|
|
|
|
# Load the data from a CSV file
|
|
data = pd.read_csv('cache/students_bymode.csv')
|
|
|
|
# Extract the relevant features
|
|
features = data[['num_semesters', 'num_units', 'inperson_units', 'hybrid_units', 'online_units']]
|
|
|
|
# Standardize the features
|
|
scaler = StandardScaler()
|
|
scaled_features = scaler.fit_transform(features)
|
|
|
|
# Perform clustering with different numbers of clusters
|
|
for n_clusters in range(4, 12):
|
|
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
|
kmeans.fit(scaled_features)
|
|
|
|
# Add the cluster labels to the original data
|
|
data[f'cluster_{n_clusters}'] = kmeans.labels_
|
|
|
|
print(f"Clustering with {n_clusters} clusters:")
|
|
print(data.groupby(f'cluster_{n_clusters}').size())
|
|
print()
|
|
|
|
# Save the updated data with cluster labels to a new CSV file
|
|
data.to_csv('cache/students_bymode_with_clusters_1.csv', index=False)
|
|
|
|
|
|
|
|
|
|
def cluster_by_mode():
|
|
data = pd.read_csv('cache/students_bymode.csv')
|
|
|
|
# Split features and target
|
|
X = data.drop('g_number', axis=1)
|
|
y = data['g_number']
|
|
|
|
# Train decision tree classifier
|
|
clf = DecisionTreeClassifier()
|
|
clf.fit(X, y)
|
|
|
|
# Visualize decision tree
|
|
dot_data = export_graphviz(clf, out_file=None,
|
|
feature_names=X.columns,
|
|
class_names=y.unique(),
|
|
filled=True, rounded=True,
|
|
special_characters=True)
|
|
graph = graphviz.Source(dot_data)
|
|
graph.render('decision_tree', view=True)
|
|
data.to_csv('cache/students_bymode_with_dt.csv', index=False)
|
|
|
|
|
|
def cluster_by_mode_2():
|
|
|
|
# Load the data from a CSV file
|
|
data = pd.read_csv('cache/students_bymode.csv')
|
|
|
|
# Extract the features (excluding the 'g_number' column)
|
|
features = data.drop('g_number', axis=1)
|
|
|
|
# Scale the features to have zero mean and unit variance
|
|
scaler = StandardScaler()
|
|
scaled_features = scaler.fit_transform(features)
|
|
|
|
# Determine the ideal number of clusters using the elbow method
|
|
inertias = []
|
|
for k in range(4, 40): # Try different values of k (e.g., 1 to 10)
|
|
kmeans = KMeans(n_clusters=k, random_state=42)
|
|
kmeans.fit(scaled_features)
|
|
inertias.append(kmeans.inertia_)
|
|
|
|
# Plot the elbow curve
|
|
import matplotlib.pyplot as plt
|
|
plt.plot(range(4, 40), inertias, marker='o')
|
|
plt.xlabel('Number of Clusters (k)')
|
|
plt.ylabel('Inertia')
|
|
plt.title('Elbow Method')
|
|
plt.show()
|
|
|
|
# Choose the ideal number of clusters based on the elbow curve
|
|
ideal_k = 12 # Adjust this based on your observation
|
|
|
|
# Perform clustering with the ideal number of clusters
|
|
kmeans = KMeans(n_clusters=ideal_k, random_state=42)
|
|
kmeans.fit(scaled_features)
|
|
|
|
|
|
|
|
# Get the cluster labels for each data point
|
|
labels = kmeans.labels_
|
|
|
|
# Add the cluster labels to the original data
|
|
data['Cluster'] = labels
|
|
|
|
# Save the cluster labels to a new CSV file
|
|
data.to_csv('cache/students_bymode_with_clusters_2.csv', index=False)
|
|
|
|
# Get the cluster centers (centroids)
|
|
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
|
|
|
|
# Print the cluster centers
|
|
for i, centroid in enumerate(centroids):
|
|
print(f"Cluster {i} center:")
|
|
for feature, value in zip(features.columns, centroid):
|
|
print(f"{feature}: {value}")
|
|
print()
|
|
|
|
|
|
# Save the trained objects to files
|
|
dump(kmeans, 'kmeans.joblib')
|
|
dump(scaler, 'scaler.joblib')
|
|
|
|
    # Load the saved objects for future use
    loaded_kmeans = load('kmeans.joblib')
    loaded_scaler = load('scaler.joblib')

    # To score new rows, load a CSV into `new_data` first -- see cluster_with_new_data()
    # below; as written here `new_data` is undefined, so these lines stay commented out.
    # new_data_scaled = loaded_scaler.transform(new_data)
    # predictions = loaded_kmeans.predict(new_data_scaled)
|
|
|
|
|
|
def cluster_with_new_data():
    ## NOT TESTED
    # Loads the kmeans and scaler objects saved by cluster_by_mode_2().
    kmeans = load('kmeans.joblib')
    scaler = load('scaler.joblib')

    # Load the new data
    new_data = pd.read_csv('new_data.csv')

    # Extract the features from the new data
    new_features = new_data.drop('g_number', axis=1)

    # Scale the new features using the fitted scaler
    scaled_new_features = scaler.transform(new_features)

    # Predict the cluster labels for the new data
    new_labels = kmeans.predict(scaled_new_features)

    # Add the cluster labels to the new data
    new_data['Cluster'] = new_labels
    return new_data

|
|
|
|
if __name__ == "__main__":
|
|
options = { 1: ['get all historical grades from ilearn',get_all] ,
|
|
2: ['process grades csv file',process_grades] ,
|
|
3: ['reorganize full grades file by student', reorganize_grades_student],
|
|
4: ['test shortname parse',nametest] ,
|
|
5: ['test sem codes',codetest] ,
|
|
6: ['get student data from orientations', get_student_orientations],
|
|
7: ['manage course master list', all_course_names],
|
|
8: ['grades to vectors', grades_to_vectors],
|
|
9: ['semester startdates list', semester_dates],
|
|
10: ['normalize course histories', normalize_course_histories],
|
|
11: ['cluster student histories', cluster_student_histories],
|
|
12: ['try to make a schedule', try_make_sched],
|
|
13: ['ES model section predict attendance', exp_smoothing_section_model],
|
|
14: ['section stats by mode', section_stats_bymode],
|
|
15: ['student courses by semester', student_by_semester],
|
|
16: ['LSTM model sections', lstm_model_sections],
|
|
17: ['rearrange section data to yearly form', sections_grouped_by_year_mode],
|
|
30: ['visualize course modes multi semester', visualize_course_modes_multi_semester],
|
|
31: ['Report on student stats', report_student_stats],
|
|
32: ['test rpy', test_rpy],
|
|
33: ['cluster students by mode', cluster_by_mode],
|
|
}
|
|
print ('')
|
|
|
|
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
|
|
resp = int(sys.argv[1])
|
|
print("\n\nPerforming: %s\n\n" % options[resp][0])
|
|
|
|
else:
|
|
print ('')
|
|
for key in options:
|
|
print(str(key) + '.\t' + options[key][0])
|
|
|
|
print('')
|
|
resp = input('Choose: ')
|
|
|
|
# Call the function in the options dict
|
|
options[ int(resp)][1]()
|