canvasapp/users.py


import json, codecs, requests, re, pdb, csv, textdistance, collections
import sys, string, funcy, math, shutil, imghdr, os
import pytz, time
import pandas as pd
import matplotlib.pyplot as plt
#from pandas import TimeGrouper
from collections import defaultdict
from pipelines import fetch, fetch_stream, getSemesterSchedule, header, url, FetchError, put_file
from courses import course_enrollment, users_in_semester
from localcache import users_this_semester_db, unwanted_req_paths, timeblock_24hr_from_dt, dt_from_24hr_timeblock
from localcache import teachers_courses_semester
from util import dept_from_name, most_common_item
from os.path import exists, getmtime
from canvas_secrets import url
from dateutil import parser
from datetime import datetime as dt
from datetime import timedelta
import datetime
import queue
from threading import Thread
from os import path
# for NLP
#import spacy
from gensim import corpora, models, similarities, downloader, utils
from nltk import stem
# todo: these constants
#last_4_semesters = 'fall2020 summer2020 spring2020 fall2019'.split(' ')
#last_4_semesters_ids = [62, 60, 61, 25]
last_4_semesters = 'spring2021 fall2020 summer2020 spring2020'.split(' ')
last_4_semesters_ids = [168, 65, 64, 62]
log_default_startdate = "2021-08-23T00:00:00-07:00"
lds_stamp = parser.parse(log_default_startdate)
recvd_date = '2023-01-01T00:00:00Z'
num_threads = 25
max_log_count = 250000
##########
##########
########## GETTING USER DATA
##########
##########
# All users to a cache file cache/allusers.json
def fetchAllUsers():
if exists('cache/allusers.json'):
mtime = dt.fromtimestamp( getmtime('cache/allusers.json') )
newname = 'cache/allusers_'+ mtime.strftime('%Y%m%d') + ".json"
print("renaming old data file to %s" % newname)
os.rename('cache/allusers.json', newname)
out1 = codecs.open('cache/allusers.json','w','utf-8')
out2 = codecs.open('cache/allusers_ids.json','w','utf-8')
all_u = fetch_stream(url + '/api/v1/accounts/1/users?per_page=100', 1)
ids = []
main_list = []
for this_fetch in all_u:
for U in this_fetch:
ids.append(U['id'])
main_list.append(U)
ids.sort()
out2.write( json.dumps(ids, indent=2))
out1.write( json.dumps(main_list, indent=2))
out2.close()
out1.close()
return ids
##########
##########
########## TEACHERS LIST AND LOCAL USERS FILE
##########
##########
# Fetch teacher user objects from the local cache
def teacherRolesCache(): # formerly load_users
users_raw = json.load(open('cache/ilearn_staff.json','r'))
users = {}
users_by_id = {}
for U in users_raw:
users[ U['login_id'] ] = U
users_by_id[ U['id'] ] = U
return users, users_by_id
# Outputs: cache/ilearn_staff.json
# Canvas: Fetch all people with gavilan.edu email address
def teacherRolesUpdateCache(): # formerly get_users
t = fetch('/api/v1/accounts/1/users?per_page=500&search_term=%40gavilan.edu&include[]=email')
g = open('cache/ilearn_staff.json','w')
g.write( json.dumps(t) )
g.close()
#put_file('/gavilan.edu/staff/flex/2020/','cache/','ilearn_staff.json')
print("Wrote to 'cache/ilearn_staff.json'")
return teacherRolesCache()
# Fetch preferred email address for a given user id. ( Canvas )
def getEmail(user_id):
results = fetch("/api/v1/users/" + str(user_id) + "/communication_channels")
for r in results:
if r['type']=='email':
return r['address']
return ''
##########
##########
########## TEACHERS AND OTHER STAFF
##########
##########
#
# Gather all my info, CRM style, in the folder teacherdata
#
#
# Typical actions: For everyone with a teacher role:
# - What are the courses they taught for the last X semesters?
# - What's their activity level each semester?
# - Which of those courses are Online, Hybrid or Face2face?
# + column for each semester: OHLOHL
# - How many online classes have they taught in the past?
# - Are they brand new, or brand new online?
# further...
# - what's their department?
# - what's their badges and 'tech level?'
# -
# All teachers in a particular course
def getAllTeachers(course_id=59): # returns a list
qry = '/api/v1/courses/' + str(course_id) + '/search_users?enrollment_type=teacher'
return fetch(url + qry)
#
def classType(t):
if t == 'lecture': return 'L'
if t == 'online': return 'O'
if t == 'hours': return 'R'
if t == 'lab': return 'A'
if t == 'hybrid': return 'H'
else: return 'L' # todo: fix bug in schedule parser so non-online classes have a type field
def my_blank_string(): return "no data"
def my_blank_dict(): return {'name':'NoName','email':'noemail@gavilan.edu'}
def my_empty_dict(): return defaultdict(my_blank_string)
def get_email_from_rec(name,name_to_record):
#print "Looking up: " + name
try:
return name_to_record[name]['email']
except Exception as e:
print("Missing Teacher %s" % name)
return 'noemail@gavilan.edu'
# Pull the staff directory on the webpage. Convert to pandas dataframe
def staff_dir(get_fresh=False):
"""
if get_fresh:
url = "http://www.gavilan.edu/staff/dir.php"
regex = "var\slist=(\[.*\]);"
response = requests.get(url).text
m = re.search(regex,response)
if m:
output = '{"staff":' + m.group(1) + '}'
of = open('cache/teacherdata/staff_dir.json','w')
of.write(output)
js = json.loads(output)
df = pd.DataFrame(js['staff'])
return df
print("Wrote cache/teacherdata/staff_dir.json")
else:
print("Failed on staff directory scrape")
return ''
else:
input = json.loads(open('cache/teacherdata/staff_dir.json','r').read())
df = pd.DataFrame(input['staff'])
return df
"""
# TODO: restore live scraping of the staff directory (see the commented block above)
old_dir = csv.reader(open('cache/personnel2020_04_12.csv'), delimiter=',')
dept1_crxn = {r[0]:r[1] for r in csv.reader(open('cache/dir_corrections.csv'), delimiter=',') }
dept2_crxn = {r[0]:r[2] for r in csv.reader(open('cache/dir_corrections.csv'), delimiter=',') }
title_crxn = {r[0]:r[3] for r in csv.reader(open('cache/dir_corrections.csv'), delimiter=',') }
revised_dir = [ ]
columns = next(old_dir)
for r in old_dir:
old_dept = r[2]
if old_dept in dept1_crxn:
new_one = dept1_crxn[old_dept]
if dept2_crxn[old_dept]: new_one += '/' + dept2_crxn[old_dept]
if title_crxn[old_dept]: new_one += '/' + title_crxn[old_dept]
r[2] = new_one
revised_dir.append(r)
print(revised_dir)
return pd.DataFrame(revised_dir,columns=columns)
#
#
#
# ###
# ### TEACHER CRM FUNCTIONS
# ###
#
def schedForTeacherOverview(long,short):
sem = getSemesterSchedule(short)
sem['type'] = sem['type'].apply(classType)
#sem['code'] = sem[['code','type']].apply(' '.join,axis=1)
sem['sem'] = short
sem = sem.drop(['time','loc','name','date','days'],axis=1) # ,'crn'
return sem
# Return a dataframe of the last 4 semester schedules put together
def oneYearSchedule():
sp19 = schedForTeacherOverview('2019spring','sp19')
su19 = schedForTeacherOverview('2019summer','su19')
fa19 = schedForTeacherOverview('2019fall','fa19')
sp20 = schedForTeacherOverview('2020spring','sp20')
# The four-semester schedule
a = pd.concat([sp19,su19,fa19,sp20], sort=True, ignore_index=True)
a = a.drop(['cap','cmp','extra','rem','sec','cred','act'], axis=1)
a.to_csv('cache/one_year_schedule.csv')
return a
def num_sections_last_year(line):
#if not type(line)=='str': return 0
parts = line.split(' ')
return len(parts)
def sec_type_stats(line):
#print(type(line))
#if not type(line)=='str': return {'fail':1}
#print("in sts: " + str(line))
parts = line.split(' ')
output = defaultdict(int)
for p in parts: output[p] += 1
return output
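# Example (a sketch of the expected behavior, not from the original source):
# sec_type_stats("O O L H") -> {'O': 2, 'L': 1, 'H': 1}, so prct_online() below
# would report 50 and prct_lecture() 25 for that same line.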
def prct_online(line):
d = sec_type_stats(line)
#print(d)
total = 0
my_total = 0
for k,v in d.items():
total += v
if k == 'O': my_total += v
return int(100 * ((1.0)*my_total / total))
def prct_lecture(line):
#print(line)
d = sec_type_stats(line)
#if 'fail' in d: return 0
total = 0
my_total = 0
for k,v in d.items():
total += v
if k == 'L': my_total += v
return int(100 * ((1.0)*my_total / total))
def prct_hybrid(line):
d = sec_type_stats(line)
#if 'fail' in d: return 0
total = 0
my_total = 0
for k,v in d.items():
total += v
if k == 'H': my_total += v
return int(100 * ((1.0)*my_total / total))
# Given the names of teachers in last year's schedules, fill in email, etc. from ilearn files
def teacher_basic_info(sched, from_ilearn, names):
bi = from_ilearn # pd.DataFrame(from_ilearn)
bi.rename(columns={'id':'canvasid','login_id':'goo'}, inplace=True)
# bi.drop(['name',],axis=1,inplace=True)
#print(bi)
#input('xx')
sp20 = schedForTeacherOverview('2020spring','sp20')
codes_sp20 = sp20.groupby('teacher')['code'].apply( lambda x: ' '.join(funcy.distinct(x)) )
crns_sp20 = sp20.groupby('teacher')['crn'].apply( lambda x: ' '.join( map( str, funcy.distinct(x))) )
codes_sp20.rename(columns={'code':'sp20code'}, inplace=True)
codes_sp20.to_csv('cache/trash/codes_sp20.csv',header=True)
crns_sp20.rename(columns={'crn':'sp20crn'}, inplace=True)
crns_sp20.to_csv('cache/trash/crns_sp20.csv',header=True)
a = sched.groupby('teacher')['code'].apply( lambda x: ' '.join(funcy.distinct(x)) )
a = pd.DataFrame(a)
a.reset_index(inplace=True)
a['dept'] = a.apply(guessDept,axis=1)
print(a)
def find_that_name(x):
#print(x)
if 'teacher' in x: return names(x['teacher'])
#print('name not found?')
return ''
a['ilearn_name'] = a.apply( find_that_name, axis=1)
a.rename(columns={'code':'courses'}, inplace=True)
#print(type(a))
a.reset_index(inplace=True)
a = pd.merge(a,codes_sp20.rename('sp20courses'), on='teacher')
a = pd.merge(a,crns_sp20.rename('sp20crns'), on='teacher')
a.to_csv('cache/trash/sched_w_sp20.csv',header=True)
print(a)
a['canvasid'] = a['teacher'].map(names)
#print(a)
c = pd.merge(bi, a, left_on='name', right_on='ilearn_name', how='outer')
c.to_csv('cache/trash/basic.csv',header=True)
#print(c)
return c
# TODO Old and broken
# what percentage of their sections were online / hybrid /lecture ?
# Consumes: output/semesters/fa19_sched.json and etc for 1 year
# Outputs: cache/teacher_by_semester.csv,
def teacherModalityHistory(sched=[],names=[]):
if not len(sched):
sched = oneYearSchedule()
#names = match_username()
# How many classes a teacher taught lect/online/hybrid/hours
sec_type = sched.groupby(['teacher','sem'])['type'].apply(' '.join)
sec_type.to_csv('cache/teacherdata/teacher_by_semester.csv',header=True)
## THIS IS THE LIST of how many
## lecture, hybrid, online they've taught
#sec_type = pd.read_csv('cache/teacherdata/teacher_by_semester.csv')
sec_grp = sec_type.groupby('teacher').aggregate( ' '.join )
#sec_grp.to_csv('cache/trash/sec_grp_3.csv',header=True)
#sec_grp = sec_grp.iloc[1:] ## I'm seeing bad items on the first 2
#sec_grp.drop(index='teacher')
#sec_grp.to_csv('cache/trash/sec_grp_0.csv',header=True)
#
sec_grp = pd.DataFrame(sec_grp)
#print(type(sec_grp))
sec_grp['prct_online'] = sec_grp['type'].map(prct_online)
sec_grp['prct_lecture'] = sec_grp['type'].map(prct_lecture)
sec_grp['prct_hybrid'] = sec_grp['type'].map(prct_hybrid)
sec_grp['num_sections_last_year'] = sec_grp['type'].map(num_sections_last_year)
sec_grp.drop('type',axis=1,inplace=True)
sec_grp.reset_index(inplace=True)
sec_grp.to_csv('cache/teacherdata/modality_history.csv')
return sec_grp
def teacherCourseHistory(a,names):
pass
# Not currently used; this functionality moved to teacher_basic_info()
sched = a.groupby(['teacher','code'])
#for name,group in sched:
# print(name)
#print(sched.count())
return
a['name'] = a.apply(lambda x: records_by_sname[x['teacher']]['name'],axis=1)
a['email'] = a.apply(lambda x: records_by_sname[x['teacher']]['email'],axis=1)
a.sort_values(by=['dept','teacher','codenum'],inplace=True)
a = a.drop(['teacher'],axis=1)
a.to_csv('cache/teacherdata/courses_taught.csv')
return a
"""
d = a.groupby(['teacher']) # ,'dept','codenum','codeletter'
out1 = open('teacherdata/courses_taught.csv','w')
by_dept = {} # x todo: sort by dept also
for name, group in d:
#print name
if re.search(r'^\d+',name) or name=='TBA':
print("Skipping weird name: ", name)
continue
rec = {'email':'xx'}
try:
rec = records_by_sname[name]
#print rec
except Exception as e:
print("Missing Teacher %s" % name)
continue
out1.write(name+"\t"+rec['email'])
s = set()
#print group
for idx,r in group.iterrows():
s.add( str(r[1]) + str(r[2]) + str(r[3]))
for clas in sorted(s):
d = dept_from_name(clas)
if d in by_dept:
if name in by_dept[d]:
by_dept[d][name].append(clas)
else:
by_dept[d][name] = [ clas, ]
else:
by_dept[d] = { name: [ clas, ] }
out1.write("\n\t"+str(clas))
out1.write("\n")
out1.write( json.dumps(by_dept,indent=2))"""
# Consumes: output/semesters/fa19_sched.json and etc for 1 year
# Outputs: cache/course_teacher_combos.csv,
def teacherSharedCourses(a=[]):
if not len(a): a = oneYearSchedule()
# List of classes. Group by teacher/format. Shows who has historically
# taught a class and who teaches it most often.
c = a.drop(['code','partofday','sem','site','type'],axis=1) #,'dept','codeletter'
c = c.groupby(['dept','codenum','codeletter']) #,'teacher'
c = c.aggregate(lambda x: set(x))
c.to_csv('teacherdata/course_teacher_combos.csv') ## THIS is the list of teachers who
## share courses
return c
# TODO: this is broken
# Consumes: output/semesters/fa19_sched.json and etc for 1 year
# Outputs: cache/num_courses_per_dept.csv (not teacher_course_oer_deptcount)
# How many courses in each department were taught in the last year?
def departmentCountCourses(a=[]):
if not len(a): a = oneYearSchedule()
tt = a.drop(['code','partofday','sem','site','type'],axis=1) #,'dept','codeletter'
"""records_by_sname = defaultdict(my_empty_dict, match_usernames())
tt.drop_duplicates(keep='first',inplace=True)
tt['name'] = tt.apply(lambda x: records_by_sname[x['teacher']]['name'],axis=1)
tt['email'] = tt.apply(lambda x: records_by_sname[x['teacher']]['email'],axis=1)
tt = tt.drop(['teacher'],axis=1)
tt.sort_values(by=['dept','name','codenum'],inplace=True)
count = tt['dept'].value_counts()
count.to_csv('cache/num_courses_per_dept.csv', header=True)"""
def clean_nonprint(s):
return re.sub(f'[^{re.escape(string.printable)}]', '', s)
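# Illustrative example only: clean_nonprint("Jane\u200b Doe") returns "Jane Doe",
# since any character outside string.printable (here a zero-width space) is stripped.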
def read_cmte(names):
output = []
out2 = defaultdict(list)
input = codecs.open('cache/teacherdata/committees_2018_2019.csv','r','utf-8')
with input as csvfile:
cmtereader = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in cmtereader:
row = [ clean_nonprint(R.strip()) for R in row ]
(fname,lname,cmtes) = row
a = re.split(r",\s*",cmtes)
cmtes = a
name1 = lname + ", " + fname
name2 = fname + " " + lname
name = name1
realname = names(name1)
if not realname:
realname = names(name2)
name = name2
if realname:
for cmm in cmtes:
output.append( [realname, cmm] )
out2[realname].append(cmm)
else:
print("committee participant name failed: %s / %s:\t%s" % (name1,name2,str(a)))
print(type(name1))
#print(out2)
return output,out2
def read_training_records():
myinput = open('cache/teacherdata/more_2018_2019_training_attendance.txt','r').readlines()
current_sesh = ""
ppl_in_sesh = {}
all_ppl = set()
for L in myinput:
L = L.strip()
if L:
if L.startswith('#'):
ma = re.search(r'^\#\s(.*)$',L)
if ma:
current_sesh = ma.group(1)
else:
print("-- read_training_records: Couldn't find training set? " + L)
else:
if current_sesh in ppl_in_sesh:
ppl_in_sesh[current_sesh].append(L)
else:
ppl_in_sesh[current_sesh] = [ L, ]
all_ppl.add(L)
if 0:
print(ppl_in_sesh)
print(all_ppl)
# Want to pivot the dict, so key is a name, value is another dict, where k2 is session name, v2 is Y/N
d_of_d = defaultdict(dict)
for k,v in ppl_in_sesh.items():
for user in v:
d_of_d[user][k] = 'Y'
return d_of_d
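# The pivoted structure returned above looks roughly like this (hypothetical names/sessions):
# { "Jane Doe": {"Canvas Basics": "Y", "Zoom Workshop": "Y"},
#   "John Roe": {"Canvas Basics": "Y"} }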
# open a file and mark the people with their ids given. Return a dataframe
def read_bootcamp1(filename):
a = pd.read_csv(filename)
#print(a)
b = a.loc[:, ['canvas_id','grade','last_activity']]
b.rename(columns={'canvas_id':'bc1canvasid','grade':'bootcamp_grade','last_activity':'bootcamp_date'}, inplace=True)
#print(b)
return b
# open a file and mark the people with their ids given. Return a dataframe
def read_bootcamp2(filename):
a = pd.read_csv(filename)
#print(a)
b = a.loc[:, ['canvas_id','grade','last_activity']]
b.rename(columns={'canvas_id':'bc2canvasid','grade':'bootcamp_progress','last_activity':'bootcamp_date'}, inplace=True)
#print(b)
return b
def not_blank_or_pound(L):
if L.startswith("#"): return False
L = L.strip()
if L == "": return False
return True
def temp1(x):
#print(x[1])
return x[1]
def add_realnames(df,names): # the surveys. raw name is in 2nd column
df['ilearn_name'] = df.apply( lambda x: names(temp1(x),1), axis=1)
return df
def compareToughNames(a,b):
# search for a in b
m = re.search(a, b)
if m: return True
return False
def compareNames(a,b,verbose=0):
if a == b: return True
cnDBG = 0
try:
parts_a = [ W.lower().strip() for W in re.split(r"[\s,]", a) ]
parts_b = [ W.lower().strip() for W in re.split(r"[\s,]", b) ]
pa2 = sorted([ parts_a[0], parts_a[-1] ])
pb2 = sorted([ parts_b[0], parts_b[-1] ])
if pa2 == pb2:
if cnDBG: print("->Match: %s, %s" % (a,b))
return True
if pa2[0] == pb2[0] or pa2[-1] == pb2[-1]:
if cnDBG: print("--->Near match: %s" % b)
return False
except Exception as e:
#print("Problem with compareNames %s , %s" % (a,b))
#print(e)
return False
if len(pa2[0])>3 and len(pb2[0])>3:
if pa2[0][0] == pb2[0][0]:
if pa2[0][1] == pb2[0][1]:
if pa2[0][2] == pb2[0][2]:
if cnDBG: print("===> Near match (first 3): %s, %s, %s, %s" % (a, b, pa2[0], pb2[0]))
pass
b = b.lower()
a = a.lower()
#if verbose: print("searching: %s / %s" % (a,b))
if re.search( b, a):
#print("REGEX MATCH: %s | %s" % (a,b))
return True
if re.search( a, b):
#print("REGEX MATCH: %s | %s" % (a,b))
return True
return False
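# Illustrative behavior (not exhaustive): compareNames("Doe, Jane", "Jane Doe") is True
# because the sorted first/last tokens match; compareNames("Doe, Jane", "Jane Smith")
# falls through to the substring checks and returns False.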
def find_ilearn_record(ilearn_records,manual_records, othername,verbose=0):
# manual records are ('name':'canvas_id')
#print(ilearn_records)
if not othername: return ""
if type(othername) == type(1.25): return ""
#if math.isnan(othername): return False
if othername in manual_records:
a = funcy.first( funcy.where( ilearn_records, id=int(manual_records[othername]) ))
if a:
return a['name']
for x in ilearn_records:
#print('f_i_r')
#print(othername)
#print(x)
if compareNames(othername,x['name'],verbose):
return x['name']
for k,v in manual_records.items():
#print(k)
#print(othername)
#print(type(othername))
b = re.search( k, othername)
if b:
a = funcy.first( funcy.where( ilearn_records, id=int(manual_records[k]) ))
if a:
return a['name']
return ""
def manualNamesAndDept():
# copied from // getTeachersInfoMain ....
schedule_one_yr = oneYearSchedule()
from_ilearn = list( map( lambda y: funcy.select_keys( lambda z: z in ['name','id','email','login_id','sortable_name'], y), \
json.loads(codecs.open('cache/ilearn_staff.json','r','utf-8').read()) ) )
manual_names = manualNames()
names_lookup = funcy.partial(find_ilearn_record, from_ilearn, manual_names)
teacher_info = teacher_basic_info(schedule_one_yr, from_ilearn, names_lookup)
# till here
# the staff directory
dr = staff_dir(False)
print(dr)
print(dr.columns)
print( dr['department'].unique() )
# now to reconcile and combine these....
#
# we want:
# - alternate names of academic / other depts, with one preferred
# - some people are PT Fac, FT Fac, Director, assistant, spec, and some titles are unknown.
# - sometimes the hierarchy is of departments, and sometimes of people. try not to confuse that.
#
# eventually, want to get pics or other info from other sources too, o365, cranium cafe, etc
#
def manualNames():
mm = dict([ x.strip().split(',') for x in \
open('cache/teacherdata/teacher_manual_name_lookup.csv','r').readlines()])
mz = {}
for k,v in mm.items():
mz[k] = v
mz[k.lower()] = v
parts = k.split(" ")
if len(parts)==2:
mz[ parts[1] + ", " + parts[0] ] = v
mz[ parts[1] + "," + parts[0] ] = v
#print(mz)
return mz
# given a list of class codes, return the most common (academic) department
def guessDept(d_list):
li = str(d_list.code).split(" ")
count = defaultdict(int)
#print(str(d_list.code))
for i in li:
m = re.search(r'^([A-Z]+)$',i)
if m:
count[m.group(1)] += 1
mmax = 0
max_L = ''
for k,v in count.items():
#print(" %s:%i, " % (k,v), end='')
if v > mmax:
mmax = v
max_L = k
print("")
return max_L
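# Illustrative: for a row whose .code string is "CSIS 85 CSIS 42 MATH 233",
# the all-caps tokens tally as CSIS:2, MATH:1, so guessDept returns "CSIS".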
"""
# Faculty Info Plans
bootcamp_active.csv Started bootcamp. Remind them to finish it?
bootcamp_passed.csv Badge'd for BC. Online and Hybrid teachers not on this list need reminding.
courses_taught.csv x
course_teacher_combos.csv Teachers who share the teaching of a course. Courses in common.
emails_deans+chairs.txt Just an email list
FA2017 Faculty Survey.csv Look at answers for video, helpful formats, and comments
faculty_main_info.csv Has percentage mix of a teachers' online/hybrid/lecture history
historical_shells_used.json x
SP2019 Faculty Survey.csv Look at rate tech skills, topics interested in, would add video, and comments
committees 2018 2019.csv Committees people serve on.
Not so useful:
teacher_by_semester.csv precursor to faculty_main_info. Has semesters separated.
"""
#
#
#
# Call all the teacher info / CRM gathering stuff
# Make one big csv file of everything I know about a teacher
def getTeachersInfoMain():
schedule_one_yr = oneYearSchedule()
#print(schedule_one_yr)
#if input('q to quit ')=='q': return
# comes from teacherRolesUpdateCache ... search for @gavilan.edu in email address
from_ilearn = list( map( lambda y: funcy.select_keys( lambda z: z in ['name','id','email','login_id','sortable_name'], y), \
json.loads(codecs.open('cache/ilearn_staff.json','r','utf-8').read()) ) )
#names_from_ilearn = list( [x.lower() for x in map( str, sorted(list(funcy.pluck('name',from_ilearn)))) ] )
from_ilearn_df = pd.DataFrame(from_ilearn)
manual_names = manualNames()
names_lookup = funcy.partial(find_ilearn_record, from_ilearn, manual_names)
#print(from_ilearn_df)
#if input('q to quit ')=='q': return
#print(schedule_one_yr)
#print("This is one year schedule.")
#input('\npress enter to continue')
teacher_info = teacher_basic_info(schedule_one_yr, from_ilearn_df, names_lookup)
#print(teacher_info)
#input('\nThis is teacher info.\npress enter to continue')
modality_history = teacherModalityHistory(schedule_one_yr,names_lookup)
print(modality_history)
#print("This is teacher modality history.")
#input('\npress enter to continue')
master = pd.merge( modality_history, teacher_info, on='teacher', how='outer')
print(master)
master.to_csv('cache/trash/joined1.csv')
print(master.columns)
#input('\nThis is Joined 1.\npress enter to continue')
wp = read_bootcamp1('cache/teacherdata/bootcamp_passed.csv')
#print(wp)
master2 = pd.merge( master, wp, left_on='canvasid_x', right_on='bc1canvasid', how='outer')
master2.to_csv('cache/trash/joined2.csv')
print(master2)
print(master2.columns)
#input('\nThis is Joined 2.\npress enter to continue')
wp = read_bootcamp2('cache/teacherdata/bootcamp_active.csv')
master3 = pd.merge( master2, wp, left_on='canvasid_x', right_on='bc2canvasid', how='outer')
master3.to_csv('cache/trash/joined3.csv')
print(master3)
print(master3.columns)
#input('\nThis is Joined 3.\npress enter to continue')
# THE VIEWS / HISTORY. UPDATE with get_recent_views() .... check it for appropriate dates....
views = json.loads( codecs.open('cache/teacherdata/activitysummary.json','r','utf-8').read() )
vdf = pd.DataFrame.from_dict(views,orient='index',columns=['cid','cname','views','goo','dates','dateviews'])
print(vdf)
#input('k')
#master3.set_index('canvasid_x')
master3 = pd.merge(master3, vdf, left_on='canvasid_x', right_on='cid',how='outer')
dir_records = pd.DataFrame(staff_dir())
dir_records['email'] = dir_records['email'].str.lower()
master3['email'] = master3['email'].str.lower()
print(dir_records)
master3 = pd.merge(master3, dir_records, on='email',how='outer')
print(master3)
#if input('q to quit ')=='q': return
#master3.fillna(0, inplace=True)
#master3['views'] = master3['views'].astype(int)
#master3['num_sections_last_year'] = master3['num_sections_last_year'].astype(int)
#cmte = pd.read_csv('cache/teacherdata/committees_2018_2019.csv')
cmte,cmte_by_name = read_cmte(names_lookup)
cmte_str_by_name = {}
for k in cmte_by_name.keys():
#print(k)
#print(cmte_by_name[k])
cmte_str_by_name[k] = ",".join(cmte_by_name[k])
cc = pd.DataFrame.from_dict(cmte_str_by_name,orient='index',columns=['committees']) # 'teacher',
cc.reset_index(inplace=True)
master4 = pd.merge(master3, cc, left_on='name', right_on='index', how='outer')
master4.to_csv('cache/trash/joined4.csv')
master4.drop(['teacher','ilearn_name','canvasid_y','bc1canvasid','bc2canvasid','cid','cname','index_y'],axis=1,inplace=True)
# Exclude surveys for now
"""
survey_2017 = pd.read_csv('cache/teacherdata/FA2017 Faculty Survey.csv')
survey_2017 = add_realnames(survey_2017,names_lookup)
survey_2017.to_csv('cache/trash/survey1.csv')
master5 = pd.merge(master4, survey_2017, left_on='name', right_on='ilearn_name', how='left')
master5.to_csv('cache/trash/joined5.csv')
survey_2019 = pd.read_csv('cache/teacherdata/SP2019 Faculty Survey.csv')
survey_2019 = add_realnames(survey_2019,names_lookup)
master6 = pd.merge(master5, survey_2019, left_on='name', right_on='ilearn_name', how='left')
master6.to_csv('cache/trash/joined6.csv')
newnames = [ x.strip() for x in open('cache/poll_question_names.txt','r').readlines() ]
namedict = {}
for i,n in enumerate(newnames):
if i%3==1: newname = n
if i%3==2: namedict[oldname] = newname
if i%3==0: oldname = n
master6 = master6.rename(columns=namedict)
master6.to_csv('cache/teacherdata/staff_main_table.csv')
master6.to_csv('cache/teacherdata/staff_main_table.csv')
"""
master4.to_csv('cache/teacherdata/staff_main_table.csv')
master4.to_csv('gui/public/staff_main_table.csv')
other_training_records = read_training_records()
#print(json.dumps(other_training_records,indent=2))
#print("This is misc workshops.")
tt = pd.DataFrame.from_dict(other_training_records,orient='index')
tt = tt.fillna("")
#print(tt)
#input('\npress enter to continue')
#teacherSharedCourses(schedule_one_yr)
#getAllTeachersInTerm()
# TODO - broken
def enroll_staff_shell():
pass
"""staff = users_with_gavilan_email()
for i,s in staff.iterrows():
print(s['canvasid'],s['name'])
u = url + '/api/v1/courses/8528/enrollments'
param = {
'enrollment[user_id]':s['canvasid'],
'enrollment[type]': 'StudentEnrollment',
'enrollment[enrollment_state]': 'active',
}
res = requests.post(u, headers = header, data=param)
print(res.text)
"""
#"Jun 28 2018 at 7:40AM" -> "%b %d %Y at %I:%M%p"
#"September 18, 2017, 22:19:55" -> "%B %d, %Y, %H:%M:%S"
#"Sun,05/12/99,12:30PM" -> "%a,%d/%m/%y,%I:%M%p"
#"Mon, 21 March, 2015" -> "%a, %d %B, %Y"
#"2018-03-12T10:12:45Z" -> "%Y-%m-%dT%H:%M:%SZ"
# take a list of raw hits.
def activity_summary(hits):
#infile = "cache/teacherdata/activity/G00101483.json"
#data = json.loads(open(infile,'r').read())
#hits = data['raw']
if not hits:
return [ [], [], ]
dt_list = []
two_weeks = datetime.timedelta(days=14)
today = dt.now().replace(tzinfo=pytz.timezone('UTC'))
target = today - two_weeks
for h in hits:
the_stamp = parser.parse(h['created_at'])
if the_stamp > target:
dt_list.append(the_stamp)
df = pd.DataFrame(dt_list, columns=['date',])
df.set_index('date', drop=False, inplace=True)
df.rename(columns={'date':'hits'}, inplace=True)
#df.resample('1D').count().plot(kind='bar')
#return df.resample('1D').count().to_json(date_format='iso')
#print(hits)
#print(df)
if not df.size:
return [ [], [], ]
bins = df.resample('1D').count().reset_index()
bins['date'] = bins['date'].apply(str)
#print(bins)
return [bins['date'].to_list(), bins['hits'].to_list()]
#plt.show()
#df = df.groupby([df['date'].dt.to_period('D')]).count().unstack()
#df.groupby(TimeGrouper(freq='10Min')).count().plot(kind='bar')
#df.plot(kind='bar')
# next step
# 1. save timestamp of the fetch
#
# 2. parse it and only fetch since then. afterwards, pull out non-hits. Summarize day/week/month stats.
#
# 2a. merge old and new records, and re-summarize.
#
# 3. Next improvements in GUI. hook up to python server backend.
#
# Get views counts on current teachers. todo: month is hardcoded here
def get_recent_views(id=1):
dt_format = "%Y-%m-%dT%H:%M:%SZ"
default_start_time = dt.strptime("2020-08-14T00:00:00Z", dt_format)
default_start_time = default_start_time.replace(tzinfo=pytz.timezone('UTC'))
end_time = dt.now(pytz.utc)
print("End time is: %s" % str(end_time))
myheaders = "x,teacher,prct_online,prct_lecture,prct_hybrid,num_sections_last_year,canvasid_x,name,sortable_name,goo,email,index_x,courses,dept,ilearn_name_x,canvasid_y,canvasid_x,bootcamp_grade,bootcamp_date_x,canvasid_y,bootcamp_progress,bootcamp_date_y,index_y,committees".split(",")
teachers = [row for row in csv.reader(open('cache/teacherdata/staff_main_table.csv','r'))][1:]
#tt = teachers[6:10]
summary = {}
for t in teachers:
name = t[1]
if name=="" or name=="TBA": continue
if not t[6]: continue
the_id = int(float(t[6]))
if the_id == 290: continue # STAFF STAFF
goo = t[9]
print(goo)
# read log of this person:
try:
prev_logf = codecs.open('cache/teacherdata/activity/%s.json' % goo,'r','utf-8')
prev_log = json.loads(prev_logf.read())
prev_logf.close()
except:
print("Exception happened on reading previous temp logs.")
prev_log = ''
if type(prev_log) == dict:
lastfetch = dt.strptime(prev_log['meta']['lastfetch'], dt_format)
lastfetch = lastfetch.replace(tzinfo=pytz.timezone('UTC'))
print("last fetch is: " + str(lastfetch))
print("Hits BEFORE was: %i" % len(prev_log['raw']))
else:
lastfetch = default_start_time
prev_log = { "raw":[], }
end_time = dt.now(pytz.utc)
u = url + "/api/v1/users/%s/page_views?start_time=%s&end_time=%s&per_page=100" % (str(the_id),lastfetch.strftime(dt_format), end_time.strftime(dt_format))
#print(u)
#input('getting this url')
print(name + "\t",end='\n')
if 1: # get fresh data?
r = fetch(u)
prev_log['raw'].extend( r )
summ = activity_summary(prev_log['raw'])
mydata = {'meta':{'lastfetch':end_time.strftime(dt_format)},'summary':summ,'raw':prev_log['raw']}
codecs.open('cache/teacherdata/activity/%s.json' % goo,'w','utf-8').write( json.dumps(mydata,indent=2))
summary[the_id] = [the_id, name, len(prev_log['raw']),goo, summ ,mydata['meta']]
print("Hits AFTER is: %i" % len(prev_log['raw']))
codecs.open('cache/teacherdata/activitysummary.json','w','utf-8').write( json.dumps(summary,indent=2) )
codecs.open('gui/public/activitysummary.json','w','utf-8').write( json.dumps(summary,indent=2) )
# TODO broken?
# Have they taught online or hybrid classes?
"""
def categorize_user(u):
global role_table, term_courses
their_courses = get_enrlmts_for_user(u, role_table)
num_s = 0
num_t = 0
type = 's'
online_only = 1
is_online = []
#print their_courses
for x in their_courses.iterrows():
if len(x):
ttype = x[1]['type']
if ttype=='StudentEnrollment': num_s += 1
if ttype=='TeacherEnrollment': num_t += 1
cid = x[1]['course_id']
current_term = term_courses[lambda x: x['id']==cid]
if not current_term.empty:
is_online.append(current_term['is_online'].values[0])
else: online_only = 0
else: online_only = 0
if num_t > num_s: type='t'
if len(is_online)==0: online_only = 0
for i in is_online:
if i==0: online_only = 0
#print "Type: " + type + " All online: " + str(online_only) + " Number courses this term: " + str(len(is_online))
return (u[0],type, online_only, len(is_online))
"""
##########
##########
########## PHOTOS
##########
########## # todo: threaded
# Does the account have a photo loaded?
def checkForAvatar(id=2):
try:
t = url + '/api/v1/users/%s?include[]=last_login' % str(id)
r2 = requests.get(t, headers = header)
result = json.loads(r2.text)
codecs.open('cache/users/%s.txt' % str(id),'w','utf-8').write( json.dumps(result,indent=2) )
if 'avatar_url' in result:
if re.search(r'avatar\-50',result['avatar_url']): return 0
else: return (result['login_id'], result['avatar_url'], result['name'])
except Exception as e:
print("Looking for an avatar / profile pic had a problem: %s" % str(e))
return 0
# Download avatars. Adjust the first `if` below when resuming after a problem.
def downloadPhoto():
pix_dir = 'cache/picsCanvas2022/'
# Update the list of all ilearn users?
i_last_ix = '-1'
photo_log_f = ''
if 0: ## set to 1 to refetch the full user list; leave at 0 when resuming after a crash
ii = fetchAllUsers()
photo_log_f = open("cache/fotolog.txt", "w")
else:
ii = json.loads(codecs.open('cache/allusers_ids.json','r').read())
photo_log_f = open("cache/fotolog.txt", "r+")
i_last_ix = -1
try:
ab = photo_log_f.read()
print(ab)
ac = ab.split("\n")
print(ac)
i_last_ix = ac[-2]
print(i_last_ix)
except:
i_last_ix = -1
i_last_ix = int(i_last_ix)
print("Last user index checked was: %s, which is id: %s" % \
(i_last_ix, ii[i_last_ix] ))
print("Max index is: %i" % len(ii))
i_last_ix += 1
for index in range(i_last_ix, len(ii)):
i = ii[index]
photo_log_f.write("\n%i" % i )
a = checkForAvatar(i)
if a:
print(str(i) + ":\t" + str(a[0]) + "\t" + str(a[2]) )
try:
r = requests.get(a[1], stream=True)
if r.status_code == 200:
r.raw.decode_content = True
h=r.raw
with open(pix_dir + a[0].lower(), 'wb') as f:
shutil.copyfileobj(h, f)
# rename to right file extension
img_type = imghdr.what(pix_dir + a[0].lower())
if img_type == 'jpeg': img_type = 'jpg'
try:
shutil.move(pix_dir + a[0].lower(),pix_dir + a[0].lower()+'.'+img_type)
except Exception as e:
print(" \tCouldn't rewrite file")
else:
print(str(i) + ":\t didn't get expected photo")
except Exception as e:
print(" \tProblem with download " + str(e))
else:
print(str(i) + ":\tno user or no photo")
pass
def mergePhotoFolders():
staff = [ row for row in csv.reader( open('cache/teacherdata/staff_main_table.csv','r') ) ]
headers = staff[0]
staff = staff[1:]
activestaff = []
for i,h in enumerate(headers):
#print("%i. %s" % (i,h) )
pass
for S in staff:
if S[7] and S[15]: # if teacher (name present) and sp20crns (taught in sp20)
activestaff.append(S[9].lower())
activestaffset=set(activestaff)
#return
a = 'cache/picsCanvas'
b = 'gui/public/picsCanvas2018'
c = 'gui/public/picsCanvasAll'
# I want a big list of who has an avatar pic,
# and I want to know how many updated since the last download, and how many are in only one or the other.
old = os.listdir(b)
count = defaultdict(int)
oldset = set()
newset = set()
for O in old:
if O.endswith('.jpg') or O.endswith('.png'):
g = O.split(r'.')[0]
oldset.add(g)
for N in os.listdir(a):
if N.endswith('.jpg') or N.endswith('.png'):
g = N.split(r'.')[0]
newset.add(g)
"""print("Active SP20 Teachers")
print(activestaffset)
print("Old Avatars")
print(oldset)
print("New Avatars")
print(newset)"""
updated_set = oldset.union(newset)
tch_set = updated_set.intersection(activestaffset)
only_old = oldset.difference(newset)
only_new = newset.difference(oldset)
print("Tch: %i Old: %i New: %i" % (len(activestaffset),len(oldset),len(newset)))
print("All avatars: %i Teachers: %i Only in old: %i Only in new: %i" % ( len(updated_set), len(tch_set), len(only_old), len(only_new)))
allpics = os.listdir(c)
haveapic = {}
for A in allpics:
if A.endswith('.jpg') or A.endswith('.png'):
g = (A.split(r'.')[0]).upper()
haveapic[g] = A
outie = codecs.open('gui/public/pics.json','w').write( json.dumps( haveapic,indent=2))
def mergePhotoFolders2():
staff = [ row for row in csv.reader( open('cache/teacherdata/staff_main_table.csv','r') ) ]
headers = staff[0]
staff = staff[1:]
activestaff = []
for i,h in enumerate(headers):
#print("%i. %s" % (i,h) )
pass
for S in staff:
if S[5]:
activestaff.append(S[9].lower())
a = 'cache/picsCanvas'
b = 'gui/public/picsCanvas2018'
c = 'gui/public/picsCanvasAll'
old = os.listdir(b)
count = defaultdict(int)
for N in os.listdir(a):
if N.endswith('.jpg') or N.endswith('.png'):
g = N.split(r'.')[0]
if g in activestaff:
count['s'] += 1
if N in old:
#print( "Y - %s" % N)
count['y'] += 1
else:
#print( "N - %s" %N )
count['n'] += 1
else:
#print("x - %s" % N)
count['x'] += 1
print("Of the 2020 avatars, %i are in the 2018 folder, and %i are new." % (count['y'],count['n']))
print("Of %i active teachers, %i have avatars." % (len(activestaff),count['s']))
#print(json.dumps(count,indent=2))
# Go through my local profile pics, upload any that are missing.
def uploadPhoto():
files = os.listdir('pics2017')
#print json.dumps(files)
pics_i_have = {}
#goo = "g00188606"
canvas_users = json.loads(open('canvas/users.json','r').read())
t = url + '/api/v1/users/self/files'
i = 0
j = 0
pics_dir = 'pics2017/'
for x in canvas_users:
j += 1
if x['login_id'].lower() + '.jpg' in files:
#print x['login_id'] + " " + x['name']
i += 1
pics_i_have[x['id']] = x
print('Canvas users: ' + str(j))
print('Pic matches: ' + str(i))
account_count = 0
ids_i_uploaded = []
for id, target in list(pics_i_have.items()):
#if account_count > 50:
# print 'Stopping after 5.'
# break
print('trying ' + target['name'] + '(' + str(id) + ')')
if checkForAvatar(id):
print("Seems to have avatar loaded.")
continue
goo = target['login_id'].lower()
local_img = pics_dir + goo + '.jpg'
inform_parameters = {
'name':goo + '.jpg',
'size':os.path.getsize(local_img), # read the filesize
'content_type':'image/jpeg',
'parent_folder_path':'profile pictures',
'as_user_id':'{0}'.format(id)
}
res = requests.post(t, headers = header, data=inform_parameters)
print("Done prepping Canvas for upload, now sending the data...")
json_res = json.loads(res.text,object_pairs_hook=collections.OrderedDict)
files = {'file':open(local_img,'rb').read()}
_data = list(json_res.items())
_data[1] = ('upload_params',list(_data[1][1].items()))
print("Yes! Done sending pre-emptive 'here comes data' data, now uploading the file...")
upload_file_response = requests.post(json_res['upload_url'],data=_data[1][1],files=files,allow_redirects=False)
# Step 3: Confirm upload
print("Done uploading the file, now confirming the upload...")
confirmation = requests.post(upload_file_response.headers['location'],headers=header)
if 'id' in confirmation.json():
file_id = confirmation.json()['id']
else:
print('no id here')
#print(confirmation.json())
print("upload confirmed...nicely done!")
time.sleep(1)
# Make an API call to set the avatar image to the token of the uploaded image (file_id)
params = { 'as_user_id':'{0}'.format(id)}
avatar_options = requests.get(url + "/api/v1/users/%s/avatars"% '{0}'.format(id),headers=header,params=params)
#print "\nAvatar options: "
#print avatar_options.json()
for ao in avatar_options.json():
#print ao.keys()
if ao.get('display_name')==goo + '.jpg':
#print("avatar option found...")
#print((ao.get('display_name'),ao.get('token'), ao.get('url')))
params['user[avatar][token]'] = ao.get('token')
set_avatar_user = requests.put(url + "/api/v1/users/%s"% '{0}'.format(id),headers=header,params=params)
if set_avatar_user.status_code == 200:
print(('success uploading user avatar for {0}'.format(id)))
account_count += 1
ids_i_uploaded.append(id)
else:
print('some problem setting avatar')
else:
pass #print 'didnt get right display name?'
print("Uploaded these guys: " + json.dumps(ids_i_uploaded))
##########
##########
########## EMAILING PEOPLE
##########
##########
#def test_email():
# send_z_email("Peter Howell", "Peter", "phowell@gavilan.edu", ['CSIS85','CSIS42'])
def create_ztc_list():
course_combos = pd.read_csv('cache/teacher_course_oer_email_list.csv')
course_combos.fillna('',inplace=True)
# read this file and make it a dict (in one line!)
dept_counts = { x[0]:x[1].strip() for x in [ y.split(',') for y in open('cache/teacher_course_oer_deptcount.csv','r').readlines() ][1:] }
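# Expanded equivalent of the one-liner above (comment-only sketch, not executed):
#   dept_counts = {}
#   for y in open('cache/teacher_course_oer_deptcount.csv','r').readlines()[1:]:
#       parts = y.split(',')
#       dept_counts[parts[0]] = parts[1].strip()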
course_template = "<a href='%s'>%s</a> &nbsp; &nbsp;"
url_template = "https://docs.google.com/forms/d/e/1FAIpQLSfZLQp6wHFEdqsmpZ7jz2Y8HtKLo8XTAhrE2fyvTDOEgquBDQ/viewform?usp=pp_url&entry.783353363=%s&entry.1130271051=%s" # % (FULLNAME, COURSE1)
# list depts
mydepts = sorted(list(set(course_combos['dept'] )))
i = 0
outp = open("output/oer_email_list.csv","w")
outp.write("fullname,firstname,email,link,courses\n")
ones_i_did = [ int(x) for x in "40 38 31 21 7 12 24 25 1 13 18 22 44 55 56 51 20 16 2 3 4 5 6 8 9 10 11 14 15 17 23 53 52 50 30 48 39 37 54 49 47 46 45 43 42 41 33 32 29 28 27 26".split(" ") ]
for D in mydepts:
i += 1
extra = ''
if D in dept_counts:
extra = " (%s)" % dept_counts[D]
extra2 = ''
if i in ones_i_did:
extra2 = "xxxx "
print("%s %i. %s %s" % (extra2,i,D,extra))
choice_list = input("Which department? (for multiple, separate with spaces) ").split(' ')
all_people_df = []
for choice in choice_list:
is_cs = course_combos['dept']==mydepts[int(choice)-1]
filtered = pd.DataFrame(course_combos[is_cs])
if len(all_people_df): all_people_df = pd.concat([filtered,all_people_df])
else: all_people_df = filtered
print(mydepts[int(choice)-1])
print(all_people_df)
print(' ')
all_people_df.sort_values(by=['name'],inplace=True)
print(all_people_df)
b = all_people_df.groupby(['name'])
for name,group in b:
if name == 'no data': continue
nameparts = name.split(', ')
fullname = nameparts[1] + ' ' + nameparts[0]
firstname = nameparts[1]
outp.write(fullname + ',' + firstname + ',')
email = ''
link = ''
courses = []
flag = 1
for i in group.iterrows():
g = i[1] # the row Series from iterrows()
this_course = g.dept + ' ' + str(g.codenum) + g.codeletter
courses.append( this_course ) #print(g)
email = g.email
if flag:
link = url_template % (fullname, this_course)
flag = 0
outp.write(email + ',' + link + "," + " ".join(courses) + "\n")
outp.close()
##########
##########
########## FORENSICS TYPE STUFF
##########
##########
# A better-named wrapper for the standard user fetch (kept adjacent alphabetically).
def get_user_info(id):
u = fetch( '/api/v1/users/%i' % id )
ff = codecs.open('cache/users/%i.txt' % id, 'w', 'utf-8')
ff.write( json.dumps(u, indent=2))
return u
# these are any messages that get pushed out to their email
def comm_mssgs_for_user(uid=0):
if not uid:
uid = input('Canvas id of the user? ')
u = url + '/api/v1/comm_messages?user_id=%s&start_time=%s&end_time=%s' % (uid,'2021-01-01T01:01:01Z','2021-08-01T01:01:01Z') # &filter[]=user_%s' % uid
convos = fetch(u,1)
oo = codecs.open('cache/comms_push_user_%s.txt' % str(uid), 'w')
oo.write('USER %s\n' % uid)
oo.write(json.dumps(convos, indent=2))
print(convos)
#
def convos_for_user(uid=0):
if not uid:
uid = input('Canvas id of the user? ')
u = url + '/api/v1/conversations?include_all_conversation_ids=true&as_user_id=%s' % uid # &filter[]=user_%s' % uid
convos = fetch(u,1)
oo = codecs.open('cache/convo_user_%s.txt' % str(uid), 'w')
oo.write('USER %s\n' % uid)
oo.write(json.dumps(convos, indent=2))
convo_ids_list = convos["conversation_ids"]
print(convo_ids_list)
u2 = url + '/api/v1/conversations?include_all_conversation_ids=true&scope=archived&as_user_id=%s' % uid # &filter[]=user_%s' % uid
archived_convos = fetch(u2,1)
try:
aconvo_ids_list = archived_convos["conversation_ids"]
print(aconvo_ids_list)
except:
print("didn't seem to be any archived.")
aconvo_ids_list = []
u3 = url + '/api/v1/conversations?include_all_conversation_ids=true&scope=sent&as_user_id=%s' % uid # &filter[]=user_%s' % uid
sent_convos = fetch(u3,1)
try:
sconvo_ids_list = sent_convos["conversation_ids"]
print(sconvo_ids_list)
except:
print("didn't seem to be any sent.")
sconvo_ids_list = []
convo_ids_list.extend(aconvo_ids_list)
convo_ids_list.extend(sconvo_ids_list)
##
## Now get all the messages in each of these conversations
##
for cid in convo_ids_list:
print("Fetching conversation id: %s" % cid)
oo.write("\n\n----------------\nconversation id: %s\n\n" % cid)
u4 = url + '/api/v1/conversations/%s?as_user_id=%s' % (cid,uid)
coverstn = fetch(u4,1)
oo.write("\n%s\n\n" % json.dumps(coverstn,indent=2))
"""
for c in convos:
c['participants'] = ", ".join([ x['name'] for x in c['participants'] ])
includes = tuple("last_message subject last_message_at participants".split(" "))
convos = list( \
reversed([ funcy.project(x, includes) for x in convos ]))
"""
#
#print(json.dumps(convos, indent=2))
# Submission lookups for a single quiz. Note: u is reassigned several times below,
# so only the last URL built (all student submissions for the course) is fetched.
def quiz_get_sub(courseid, quizid, subid=0):
u = url + "/api/v1/courses/%s/quizzes/%s/submissions/%s" % ( str(courseid), str(quizid), str(subid) )
u = url + "/api/v1/courses/%s/quizzes/%s/questions?quiz_submission_id=%s" % \
( str(courseid), str(quizid), str(subid) )
u = url + "/api/v1/courses/%s/assignments/%s/submissions/%s?include[]=submission_history" % \
( str(courseid), str(quizid), str(subid) )
u = url + "/api/v1/courses/%s/students/submissions?student_ids[]=all&include=submission_history&grouped=true&workflow_state=submitted" % str(courseid)
return fetch(u)
#?quiz_submission_id=%s"
# quiz submissions for quiz id x, in course id y
def quiz_submissions(courseid=9768, quizid=32580):
#subs = quiz_get_sub(courseid, quizid)
#print( json.dumps( subs, indent=2 ) )
if 1:
# POST
data = { "quiz_report[includes_all_versions]": "true", "quiz_report[report_type]": "student_analysis" }
u = url + "/api/v1/courses/%s/quizzes/%s/reports?" % ( str(courseid), str(quizid) )
res = requests.post(u, headers = header, data=data)
print(res.content)
#u2 = url + "/api/v1/courses/%s/quizzes/%s/reports" % ( str(courseid), str(quizid) )
#res2 = fetch(u2)
#print( json.dumps(res2.content, indent=2))
jres2 = json.loads( res.content )
print(jres2)
if jres2['file'] and jres2['file']['url']:
u3 = jres2['file']['url']
r = requests.get(u3, headers=header, allow_redirects=True)
open('cache/quizreport.txt', 'wb').write(r.content)
return
# NOTE: everything below the `return` above is unreachable legacy code, kept for reference.
for R in res2:
if R['id'] == 7124:
u3 = R['url']
r = requests.get(u3, headers=header, allow_redirects=True)
open('cache/quizreport.txt', 'wb').write(r.content)
return
u3 = url + "/api/v1/courses/%s/quizzes/%s/reports/%s" % ( str(courseid), str(quizid), res2[''] )
oo = codecs.open('cache/submissions.json','w', 'utf-8')
oo.write('[\n')
for s in subs:
if len(s['submissions']):
j = json.dumps(s, indent=2)
print(j)
oo.write(j)
oo.write('\n')
oo.write('\n]\n')
return 0
#u = url + "/api/v1/courses/%s/quizzes/%s/submissions?include[]=submission" % (str(courseid), str(quizid))
u = url + "/api/v1/courses/%s/quizzes/%s/submissions" % (str(courseid), str(quizid))
subs = fetch(u, 0)
print( json.dumps( subs, indent=1 ) )
for S in subs['quiz_submissions']:
print(json.dumps(S))
submis = quiz_get_sub(courseid, quizid, S['id'])
print(json.dumps(submis, indent=2))
# return (timeblock, course, read=0,write=1)
def requests_line(line,i=0):
try:
L = line # strip?
if type(L) == type(b'abc'): L = line.decode('utf-8')
for pattern in unwanted_req_paths:
if pattern in L:
return 0
i = 0
line_parts = list(csv.reader( [L] ))[0]
#for p in line_parts:
# print("%i\t%s" % (i, p))
# i += 1
d = parser.parse(line_parts[7])
d = d.replace(tzinfo=pytz.timezone('UTC')).astimezone(pytz.timezone('US/Pacific'))
d = timeblock_24hr_from_dt(d)
#r = re.search('context\'\:\s(\d+)', line_parts[22])
#c = 0
#if r:
# c = r.groups(1)
str1 = line_parts[20]
str2 = str1.replace("'",'"')
str2 = str2.replace("None",'""')
#print(str2)
j = json.loads(str2 )
c = j['context']
a = line_parts[5]
#print( str( (d, c, a) ))
return (d, str(c), a)
except Exception as e:
#print("Exception: " + str(e))
return 0
#
def report_logs(id=0):
if not id:
L = ['10531', ]
else:
L = [ id, ]
report = []
for id in L:
emt_by_id = course_enrollment(id)
for U in emt_by_id.values():
user_d = defaultdict( int )
print( "Lookin at user: %s" % U['user']['name'] )
report.append( "User: %s\n" % U['user']['name'] )
log_file_name = 'cache/users/logs/%i.csv' % U['user']['id']
if path.exists(log_file_name):
print("Log file %s exists" % log_file_name)
temp = open(log_file_name, 'r').readlines()
for T in temp[1:]:
#print(T)
result = requests_line(T)
if result:
(d, c, a) = result
if c == id:
user_d[d] += 1
print(json.dumps(user_d, indent=2))
for V in sorted(user_d.keys()):
report.append( "\t%s: %i\n" % ( dt_from_24hr_timeblock(V), user_d[V]) )
report.append("\n\n")
return report
def track_users_in_sem():
L = users_this_semester_db()
sL = list(L)
sL.sort(reverse=True)
fetch_queue = queue.Queue()
for i in range(num_threads):
worker = Thread(target=track_user_q, args=(i,fetch_queue))
worker.daemon = True
worker.start()
for U in sL:
print( "adding %s to the queue" % U )
fetch_queue.put( U )
fetch_queue.join()
print("Done.")
def track_users_in_class(L=[]):
if len(L)==0:
#id = '10531'
ids = input("Course ids, separated with comma: ")
L = [x for x in ids.split(',')]
print("Getting users in: " + str(L))
fetch_queue = queue.Queue()
for i in range(num_threads):
worker = Thread(target=track_user_q, args=(i,fetch_queue))
worker.daemon = True
worker.start()
users_set = set()
for id in L:
emt_by_id = course_enrollment(id)
print(emt_by_id)
for U in emt_by_id.values():
if not U['user_id'] in users_set:
print(U)
print( "adding %s to the queue" % U['user']['name'] )
fetch_queue.put( U['user_id'] )
users_set.add(U['user_id'])
all_reports = []
fetch_queue.join()
print("Done with %i users in these courses." % len(users_set))
for id in L:
rpt = report_logs(id)
all_reports.append(rpt)
outp = codecs.open('cache/courses/report_%s.txt' % id, 'w', 'utf-8')
outp.write(''.join(rpt))
outp.close()
return all_reports
def track_user_q(id, q):
while True:
user = q.get()
print("Thread %i: Going to download user %s" % (id, str(user)))
try:
track_user(user, id)
except FetchError as e:
pass
q.task_done()
# Honestly, it doesn't make much sense to fetch full histories this way if they're
# already in the Canvas data tables; just get the most recent hits or a short period.
#
# Live data would be better.
# Maintain local logs. Look to see if we have some, download logs since then for a user.
def track_user(id=0,qid=0):
global recvd_date
L = [id,]
if not id:
ids = input("User ids (1 or more separated by comma): ")
L = [int(x) for x in ids.split(',')]
print("Getting users: " + json.dumps(L))
for id in L:
id = int(id)
# Open the info file if it exists and check the last day retrieved
try:
infofile = open("cache/users/%i.txt" % id, 'r')
info = json.loads( infofile.read() )
# TODO: set up this info file if it isn't there. check any changes too. it
# was written where?....
infofile.close()
except Exception as e:
print("failed to open info file for user id %i" % id)
info = get_user_info(id)
print("(%i) Student %i Info: " % (qid,id))
#print( json.dumps(info, indent=2))
url_addition = ""
if 1: # hard code dates
start_date = "2023-01-01T00:00:00-07:00"
end_date = "2023-08-01T00:00:00-07:00"
url_addition = f"?start_time={start_date}&end_time={end_date}"
elif 'last_days_log' in info:
print("There's existing log data for %s (%s)" % (info['name'] , info['sis_user_id']))
print("Last day logged was: %s" % info['last_days_log'])
url_addition = "?start_time=%s" % info['last_days_log']
the_stamp = parser.parse(info['last_days_log'])
the_stamp = the_stamp.replace(tzinfo=pytz.timezone('UTC')).astimezone(pytz.timezone('US/Pacific'))
now = dt.now()
now = now.replace(tzinfo=pytz.timezone('UTC')).astimezone(pytz.timezone('US/Pacific'))
dif = now - the_stamp
print("It was %s ago" % str(dif))
if the_stamp < lds_stamp:
print("Too long, taking default")
url_addition = "?start_time=%s" % log_default_startdate
#lds_stamp = parser.parse(log_default_startdate)
##########
else:
url_addition = "?start_time=%s" % log_default_startdate
#if dif.days > 1:
url = "/api/v1/users/%i/page_views%s" % (id, url_addition)
print(url)
try:
api_gen = fetch_stream(url,0)
log_file_name = 'cache/users/logs/%i.csv' % id
if path.exists(log_file_name):
print("Log file %s exists" % log_file_name)
temp = open(log_file_name, 'a', newline='')
csv_writer = csv.writer(temp)
else:
print("Creating new log file: %s" % log_file_name)
temp = open(log_file_name, 'w', newline='') ### TODO
csv_writer = csv.writer(temp)
count = 0
for result in api_gen:
if count == 0 and len(result):
col_names = result[0].keys()
csv_writer.writerow(col_names)
# results come in newest first....
recvd_date = result[0]['updated_at']
print("(%i) Most recent hit is %s" % (qid,recvd_date))
count += len(result)
indent = " " * qid
#print("(%i) Got %i records, %i so far" % (qid,len(result),count))
print("(%s - %i) %s %i" % (qid, id, indent, count))
if count > max_log_count:
print("Too many logs, bailing. sorry.")
break
for R in result:
csv_writer.writerow(R.values())
latest = parser.parse(recvd_date)
#last_full_day = (latest - timedelta(days=1)).isoformat()
info['last_days_log'] = recvd_date #last_full_day
infofile = open("cache/users/%i.txt" % id, 'w')
infofile.write(json.dumps( info, indent=2 ))
infofile.close()
print("(%i) Output to 'cache/users/log/%i.csv'" % (qid,id))
except FetchError as e:
print("Getting a 502 error.")
raise FetchError()
except Exception as e2:
print("Got an error receiving logs: %s" % str(e2))
#
def track_users_by_teacherclass():
all_teachers = teachers_courses_semester()
skip_to = "Punit Kamrah"
skipping = 1
grouped = funcy.group_by( lambda x: x[4], all_teachers )
g2 = {}
for k,v in grouped.items():
print(k)
if skipping and skip_to != k:
print("skipping")
continue
skipping = 0
g2[k] = list(funcy.distinct( v, 1 ))
print("\n\n\n\n\n")
print(k)
print("\n\n\n\n\n")
teacherfile = codecs.open('cache/teacherdata/reports/%s.txt' % k.replace(" ","_"),'w','utf-8')
class_ids = funcy.lpluck(1,v)
class_names = funcy.lpluck(2,v)
print(class_ids)
print(class_names)
rpts = track_users_in_class(class_ids)
for i, R in enumerate(rpts):
teacherfile.write('\n\n\n---\n\n%s \n\n' % class_names[i])
teacherfile.write(''.join(R))
teacherfile.flush()
teacherfile.close()
print(json.dumps(g2, indent=2))
def nlp_sample():
# Stream a training corpus directly from S3.
#corpus = corpora.MmCorpus("s3://path/to/corpus")
stemmer = stem.porter.PorterStemmer()
strings = [
"Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey",
]
processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
print(processed)
dictionary = corpora.Dictionary( processed )
dct = dictionary
print(dictionary)
corpus = [dictionary.doc2bow(text) for text in processed]
print(corpus)
# Train Latent Semantic Indexing (LSI) with a small number of topics (4 here).
lsi = models.LsiModel(corpus, num_topics=4)
print(lsi.print_topics(-1))
# Convert another corpus to the LSI space and index it.
#index = similarities.MatrixSimilarity(lsi[another_corpus])
tfidf = models.TfidfModel(corpus)
#index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
index = similarities.MatrixSimilarity(lsi[corpus])
print(index)
# Compute similarity of a query vs indexed documents.
query = "tree graph".split()
query_bow = dictionary.doc2bow(query)
vec_lsi = lsi[query_bow]
print(query_bow)
print(tfidf[query_bow])
print(vec_lsi)
print("ok")
# LdaMulticore
lda_model = models.LdaModel(corpus=corpus,
id2word=dictionary,
random_state=100,
num_topics=4,
passes=40,
chunksize=1000,
#batch=False,
alpha='asymmetric',
decay=0.5,
offset=64,
eta=None,
eval_every=0,
iterations=100,
gamma_threshold=0.001,
per_word_topics=True)
lda_model.save('cache/lda_model.model')
print(lda_model.print_topics(-1))
print(lda_model)
for c in lda_model[corpus]:
print("Document Topics : ", c[0]) # [(Topics, Perc Contrib)]
print("Word id, Topics : ", c[1][:3]) # [(Word id, [Topics])]
print("Phi Values (word id) : ", c[2][:2]) # [(Word id, [(Topic, Phi Value)])]
print("Word, Topics : ", [(dct[wd], topic) for wd, topic in c[1][:2]]) # [(Word, [Topics])]
print("Phi Values (word) : ", [(dct[wd], topic) for wd, topic in c[2][:2]]) # [(Word, [(Topic, Phi Value)])]
print("------------------------------------------------------\n")
sims = index[vec_lsi]
print("ok2")
print(list(enumerate(sims)))
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
print(document_number, score)
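# A minimal follow-up sketch (assuming the model file written above exists): the saved
# LDA model can be reloaded later with gensim's standard loader, e.g.
#   lda_model = models.LdaModel.load('cache/lda_model.model')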
'''
def nlp_sample2():
# load english language model
nlp = spacy.load('en_core_web_sm',disable=['ner','textcat'])
text = "This is a sample sentence."
# create spacy
doc = nlp(text)
for token in doc:
print(token.text,'->',token.pos_)
'''
def one_course_enrol():
users = '96 18771 2693 5863 327'.split()
course = '11015'
the_type = 'TeacherEnrollment' # 'StudentEnrollment'
u = url + '/api/v1/courses/%s/enrollments' % course
for user in users:
param = {
'enrollment[user_id]':user,
'enrollment[type]': the_type,
'enrollment[enrollment_state]': 'active',
}
res = requests.post(u, headers = header, data=param)
print(res.text)
def find_new_teachers():
filename = "cache/fa22_sched.json"
jj = json.loads(codecs.open(filename,'r','utf-8').read())
for J in jj:
print( J['teacher'])
def user_db_sync():
# currently in db
conusr = fetch("http://192.168.1.6:8080/dir_api.php?users=1")
conusr_emails = set([x.lower() for x in funcy.pluck('email',conusr)])
#fetch all staff from ilearn ILRN unique emails
ilrn = json.loads(codecs.open("cache/ilearn_staff.json","r","utf-8").read())
ilrn_emails = set([x.lower() for x in funcy.pluck('email',ilrn)])
for e in ilrn_emails:
if not (e in conusr_emails) and e.endswith('@gavilan.edu'):
E = funcy.first(funcy.where(ilrn,email=e))
goo = E['login_id'][3:]
#print("not in conf_user: %s \t %s \t %s" % (e,E['short_name'], E['login_id']) )
print("INSERT INTO conf_users (goo,email,name) VALUES ('%s', '%s', '%s');" % (goo,e,E['short_name']) )
def user_db_sync2():
#fetch all personnel dir entries from dir_api.php. PERSL unique emails
persl = fetch("http://hhh.gavilan.edu/phowell/map/dir_api.php?personnel=1")
persl_emails = set([x.lower() for x in funcy.pluck('email',persl)])
#persl_ids = set([x.lower() for x in funcy.pluck('email',persl)])
#
#fetch all staff from ilearn ILRN unique emails
ilrn = json.loads(codecs.open("cache/ilearn_staff.json","r","utf-8").read())
ilrn_emails = set([x.lower() for x in funcy.pluck('email',ilrn)])
#
#fetch all conf_users from dir_api.php CONUSR unique emails
conusr = fetch("http://hhh.gavilan.edu/phowell/map/dir_api.php?users=1")
conusr_emails = set([x.lower() for x in funcy.pluck('email',conusr)])
	# fetch all gavi_personnel_ext entries from dir_api.php (GPEREXT); each row must have a 'personnel' column, a 'c_users' column, or both.
gperext = fetch("http://hhh.gavilan.edu/phowell/map/dir_api.php?personnelext=1")
all_emails = set(persl_emails)
all_emails.update(ilrn_emails)
all_emails.update(conusr_emails)
all_emails = list(all_emails)
all_emails.sort()
fout = codecs.open('cache/db_staff_report.csv','w','utf-8')
fout.write('email,personnel_dir,ilearn,conf_user\n')
for e in all_emails:
if e in ilrn_emails and not (e in conusr_emails) and e.endswith('@gavilan.edu'):
E = funcy.first(funcy.where(ilrn,email=e))
goo = E['login_id'][3:]
#print("not in conf_user: %s \t %s \t %s" % (e,E['short_name'], E['login_id']) )
print("INSERT INTO conf_users (goo,email,name) VALUES ('%s', '%s', '%s');" % (goo,e,E['short_name']) )
# goo (minus G00) email, and name go into conf_users
fout.write(e+',')
if e in persl_emails:
fout.write('1,')
else:
fout.write('0,')
if e in ilrn_emails:
fout.write('1,')
else:
fout.write('0,')
if e in conusr_emails:
fout.write('1,')
else:
fout.write('0,')
fout.write('\n')
fout.close()
#
#print( json.dumps( [persl,ilrn,conusr,gperext], indent=2 ) )
print('done')
import traceback
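# Audit cache/allusers.json for accounts with no login_id (no G number), a root account other than
# ilearn.gavilan.edu, or a missing sis_import_id, and collect their eportfolios.
# The DO_DELETE_* flags optionally delete those users and portfolios.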
def find_no_goo():
DO_DELETE_USERS = 0
DO_DELETE_PORTFOLIOS = 0
output = codecs.open('cache/no_goo_numbers.json','w','utf-8')
output2 = codecs.open('cache/wrong_root_acct.json','w','utf-8')
output3 = codecs.open('cache/wrong_sis_import_id.json','w','utf-8')
output4 = codecs.open('cache/bad_portfolios.json','w','utf-8')
#output5 = codecs.open('cache/bad_portfolios_detail.html','w','utf-8')
all = []
no_root = []
no_sis = []
port = []
i = 0
j = 0
k = 0
p = 0
users = json.loads(codecs.open('cache/allusers.json','r','utf-8').read())
for u in users:
if not 'login_id' in u:
print(u['name'])
i+=1
all.append(u)
user_port = []
pp = fetch(url + '/api/v1/users/%s/eportfolios' % str(u['id']))
for p_user in pp:
try:
user_port.append( fetch(url+'/api/v1/eportfolios/%s' % str(p_user['id']) ) )
if DO_DELETE_PORTFOLIOS:
#output5.write("<br />deleted: <a href='https://ilearn.gavilan.edu/eportfolios/%s'>%s\n" % (str(p_user['id']),str(p_user['id'])) )
#output5.flush()
del_request = requests.delete(url + "/api/v1/eportfolios/%s" % str(p_user['id']) ,headers=header)
print(del_request.text)
except Exception as e:
traceback.print_exc()
p += len(pp)
port.append(pp)
if DO_DELETE_USERS:
print("Deleting %s..." % u['name'])
del_request = requests.delete(url + "/api/v1/accounts/1/users/%s" % str(u['id']) ,headers=header)
print(del_request.text)
if 'root_account' in u and u['root_account'] != "ilearn.gavilan.edu":
no_root.append(u)
j += 1
if 'sis_import_id' in u and not u['sis_import_id']:
no_sis.append(u)
k += 1
print("Found %i users without G numbers" % i)
print("Found %i users with non gav root account" % j)
print("Found %i users without sis id" % k)
print("Found %i questionable portfolios" % p)
output.write( json.dumps(all,indent=2) )
output2.write( json.dumps(no_root,indent=2) )
output3.write( json.dumps(no_sis,indent=2) )
output4.write( json.dumps(port,indent=2) )
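# Prompt for a Canvas user id and dump that user's activity logs via track_user.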
def track_a_user():
a = input("User ID? ")
track_user(a)
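# Compare the staff directory and conf_users tables by email: print common and distinct addresses,
# and write cache/users_fix.sql to back-fill conf_users.p2id from the matching directory id.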
def compare_db_tables():
import requests
# Download JSON files
url_a = 'http://www.gavilan.edu/staff/tlc/db.php?a=dir'
url_b = 'http://www.gavilan.edu/staff/tlc/db.php?a=confusers'
response_a = requests.get(url_a)
print('got 1')
response_b = requests.get(url_b)
print('got 2')
# Parse JSON data
data_a = response_a.json()
data_b = response_b.json()
	# Index the staff-directory rows by lower-cased email so lookups below match the normalized email sets.
	by_email_conf = {}
	for item in data_a:
		if item['email']:
			by_email_conf[item['email'].lower()] = item
# Extract email addresses from each file
emails_a = {item['email'] for item in data_a}
emails_b = {item['email'] for item in data_b}
emails_a = {item for item in emails_a if item is not None}
emails_b = {item for item in emails_b if item is not None}
emails_a = {item.lower() for item in emails_a}
emails_b = {item.lower() for item in emails_b}
# Find common emails
common_emails = emails_a.intersection(emails_b)
# Find distinct emails for each file
distinct_emails_a = emails_a.difference(emails_b)
distinct_emails_b = emails_b.difference(emails_a)
# Print the results
print("Common Emails:")
for email in sorted(list(common_emails)):
print(email)
print("\nDistinct Emails in Staff directory:")
for email in sorted(list(distinct_emails_a)):
print(email)
print("\nDistinct Emails in conf users table:")
for email in sorted(list(distinct_emails_b)):
print(email)
out = codecs.open('cache/users_fix.sql','w','utf-8')
for e in common_emails:
out.write(f"update `conf_users` set `p2id`='{by_email_conf[e]['id']}' where lower(`email`)='{e}';\n")
# Given one student's list of section enrollments, summarize them as a single CSV row:
# units by modality, percent online, and a fall 2023 (202370) breakdown.
def student_history_analysis(sh):
from functools import reduce
semesters_set = set()
num_sems = 0
num_course = len(sh)
num_units = 0
units_online = 0
units_inperson = 0
units_hybrid = 0
units_ol = 0
fa_23_units = 0
fa_23_online_units = 0
fa23_courses = 0
fa23_onlinecourses = 0
#un_list = [ float(x['units'].split('-')[0].split('/')[0]) for x in sh ]
#num_units = reduce(lambda x,y: x+y, un_list)
for section in sh:
semesters_set.add(section['sis'])
units = float(section['units'].split('-')[0].split('/')[0])
num_units += units
if section['type'] == 'in-person': units_inperson += units
if section['type'] == 'online': units_online += units
if section['type'] == 'hybrid': units_hybrid += units
if section['type'] == 'online live': units_ol += units
if section['sis'] == '202370':
fa_23_units += units
fa23_courses += 1
if not section['type'] == 'in-person':
fa_23_online_units += units
fa23_onlinecourses += 1
num_sems = len(semesters_set)
if num_units == 0:
pct_online = 0
else:
pct_online = round(100 * (units_online+units_hybrid+units_ol) / num_units, 1)
if fa_23_units == 0:
fa_23_pct_online = 0
else:
fa_23_pct_online = round(100 * (fa_23_online_units) / fa_23_units, 1)
if fa23_courses == 0:
fa23_pct_course_online = 0
else:
fa23_pct_course_online = round(100 * (fa23_onlinecourses) / fa23_courses, 1)
	summary = [num_units, num_course, f"\"{sh[0]['sortablename']}\",{sh[0]['canvasid']},{num_sems},{num_course},{num_units},{units_online},{units_inperson},{units_hybrid},{units_ol},{pct_online},{fa_23_units},{fa_23_online_units},{fa_23_pct_online},{fa23_courses},{fa23_onlinecourses},{fa23_pct_course_online}"]
return summary
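# Write cache/student_history_current_students.csv and cache/student_units.txt:
# one summary row per current-semester student (units, modality mix, fall 2023 online share).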
def report_student_stats():
from localcache import users_with_history, students_current_semester
from itertools import groupby
u = users_with_history()
this_sem = [x['canvasid'] for x in students_current_semester()]
df = pd.DataFrame(u)
filtered_df = df[df['canvasid'].isin(this_sem)]
filtered_df.to_csv('cache/student_history_current_students.csv',index=False)
oo = codecs.open('cache/student_units.txt','w','utf-8')
oo.write("name,id,num_sems,num_course,num_units,units_online,units_inperson,units_hybrid,units_ol,percent_online,fa23_units,fa23_onlineunits,fa23_pct_online,fa23_num_courses,fa23_num_onlinecourses,fa23_percent_online_course\n")
	# Group the history rows by canvasid. Note: itertools.groupby only merges adjacent rows,
	# so this assumes users_with_history() returns rows already ordered by canvasid.
def kk(x): return x['canvasid']
grouped_dict = {key:list(group) for key, group in groupby(u, kk)}
shorter = []
for k,g in grouped_dict.items():
if k in this_sem:
h = student_history_analysis(g)
#oo.write(json.dumps(h[2],indent=2)+ "\n")
oo.write( str(h[2]) + "\n")
shorter.append(h)
else:
print(f"Skipping {k}")
#print(this_sem)
#oo.write('units,courses\n')
#shorter.sort(key=lambda x: x[0], reverse=True)
#for s in shorter:
# print(s[2])
# #oo.write(f"{s[0]},{s[1]}\n")
# #print('\n\n')
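# Command-line menu: pass an option number as the first argument, or run with no arguments to choose interactively.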
if __name__ == "__main__":
print ("")
options = { 1: ['Fetch iLearn users with @gavilan.edu email address', teacherRolesUpdateCache],
2: ['Fetch all users',fetchAllUsers],
5: ['Download user avatars', downloadPhoto],
6: ['Merge photo folders', mergePhotoFolders],
7: ['Get all teachers logs 1 month', get_recent_views],
8: ['Gather teacher history, a variety of stats.', getTeachersInfoMain],
9: ['test rtr.', read_training_records],
10: ['Get a users logs', track_user],
11: ['test: oneYearSchedule', oneYearSchedule],
12: ['summarize hit activity', activity_summary],
13: ['Get all users logs in a class', track_users_in_class],
14: ['Get logs for 1 user', track_a_user ],
15: ['Get all users logs in a semester', track_users_in_sem],
16: ['Report on attendance for all classes', track_users_by_teacherclass],
17: ['Show all convos for a user', convos_for_user],
		26: ['Show all pushed notifications for a user', comm_mssgs_for_user],
18: ['Quiz submissions', quiz_submissions],
19: ['NLP Sample', nlp_sample],
20: ['Enroll a single user into a class', one_course_enrol],
21: ['Teachers new this semester', find_new_teachers],
22: ['Sync personnel and conference user databases', user_db_sync],
23: ['Find non-gnumbers', find_no_goo ],
24: ['compare user tables', compare_db_tables],
25: ['Report on student stats', report_student_stats],
#3: ['Main index, 1 year, teachers and their classes', getAllTeachersInTerm],
#5: ['Match names in schedule & ilearn', match_usernames],
#6: ['Create Dept\'s ZTC list', create_ztc_list],
##7: ['Build and send ZTC emails', send_ztc_mails],
#14: ['investigate the logs', investigate_logs],
#12: ['test: match_usernames', match_usernames],
#13: ['test: get all names', getAllNames],
#13: ['x', users_with_gavilan_email],
}
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()