canvasapp/localcache.py

# Local data, saving and manipulating

import os, re, gzip, codecs, funcy, pytz, sqlite3, json, random, functools, requests, sys, csv
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime as dt
from datetime import timedelta
from dateutil.parser import parse
from os.path import exists, getmtime
from pipelines import sync_non_interactive, url, header, gp, dean

#from courses import getCoursesInTerm
#from courses import user_in_depts_live

mycourses = {}

local_data_folder = 'cache/canvas_data/'
sqlite_file = local_data_folder + 'data.db'    #'data_su20_4hr_blocks.db'
mylog = codecs.open(local_data_folder + 'canvas_data_log.txt','w')

thefiles_dat = {}
try:
    for L in open('cache/canvas_data_index.txt','r').readlines():
        L = L.strip()
        (fname,start,finish) = L.split(',')
        thefiles_dat[fname] = start
except Exception as e:
    print("cache/canvas_data_index.txt was not found")

thefiles = open('cache/canvas_data_index_temp.txt','a')    # rename me if nothing crashes :)


NUM_ONLY = 1    # use numeric codes instead of strings. For mathy stuff

requests_sum1_format = "id userid courseid timeblock viewcount partcount".split(" ")
requests_sum1_types = "INTEGER PRIMARY KEY AUTOINCREMENT,text,text,INTEGER,INTEGER,INTEGER".split(",")
requests_format = "id timestamp year month day userid courseid rootid course_acct_id quizid discussionid conversationid assignmentid url useragent httpmethod remoteip micros controller action contexttype contextid realid sessionid agentid httpstatus httpversion developer_key_id time_block".split(" ")
users_format = "id canvasid rootactid name tz created vis school position gender locale public bd cc state sortablename globalid".split(" ")
cc_format = "id canvasid userid address type position state created updated".split(" ")
term_format = "id canvasid rootid name start end sis".split(" ")
course_format = "id canvasid rootactid acctid termid name code type created start conclude visible sis state wikiid".split(" ")
role_format = "id canvas_id root_account_id account_id name base_role_type workflow_state created_at updated_at deleted_at".split(" ")
course_score_format = "s_id c_id a_id course_id enrol_id current final muted_current muted_final".split(" ")
enrollment_dim_format = "id cid root course_section role type workflow created updated start end complete self sis course_id user_id last_activity".split(" ")
communication_channel_dim_format = "id canvas_id user_id address type position workflow_state created_at updated_at".split(" ")
pseudonym_dim_format = "id canvas_id user_id account_id workflow_state last_request_at last_login_at current_login_at last_login_ip current_login_ip position created_at updated_at password_auto_generated deleted_at sis_user_id unique_name integration_id authentication_provider_id".split(" ")
conversation_dim_format = "id canvas_id has_media_objects subject course_id group_id account_id".split(" ")
conversation_message_dim_format = "id canvas_id conversation_id author_id created_at generated has_attachments has_media_objects body".split(" ")


unwanted_req_paths = """conversations/unread_count
CFNetwork
TooLegit
lti_user_id
brand_variables
dashboard-sidebar
dashboard_cards
ping
self/profile
login/oauth2
login/session_token
self/colors
self/profile
images/thumbnails
auth/login
auth/conversations
backup/login
blackboard ally
Proctorio
image_thumbnail
manifest.json
launch_definitions/login
login
python-requests
custom_data
content_shares
pandata_events
trypronto
users/self """.split("\n")

other_interesting_events = { }

DB_CON = 0
DB_CUR = 0

#########
######### LOCAL DB
#########

def db():
    global DB_CON, DB_CUR
    if DB_CON:
        return (DB_CON,DB_CUR)
    print('grabbing db connection')
    DB_CON = sqlite3.connect(sqlite_file)
    DB_CUR = DB_CON.cursor()

    return (DB_CON, DB_CUR)


def setup_table(table='requests'):
    (con,cur) = db()
    q = ''


    if table=='conversation':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS conversation (\n"
        for L in conversation_dim_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"


    if table=='conversation_message':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS conversation_message (\n"
        for L in conversation_message_dim_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"


    if table=='requests_sum1':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS requests_sum1 (\n"
        for j, L in enumerate(requests_sum1_format):
            if j:
                (col,typ) = (L,requests_sum1_types[j])
                q += ",\n\t%s %s" % (col,typ)
            else:
                (col,typ) = (L,requests_sum1_types[j])
                q += "\t%s %s" % (col,typ)

        q += "\n);\n"
        print(q)
        cur.execute(q)

        q = "CREATE UNIQUE INDEX index1 ON requests_sum1(userid,courseid,timeblock);"

    if table=='requests':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS requests (\n"
        for L in open('cache/request_table.txt','r').readlines():
            L = L.strip()
            #print(L)
            (col,type) = re.split("\s\s\s\s",L)
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"

    if table=='users':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS users (\n"
        for L in users_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"

    if table=='pseudonym':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS pseudonym(\n"
        for L in pseudonym_dim_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"


    if table=='courses':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS courses (\n"
        for L in course_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"

    if table=='enrollment':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS enrollment (\n"
        for L in enrollment_dim_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"

    if table=='comm_channel':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS comm_channel (\n"
        for L in communication_channel_dim_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"

    if table=='terms':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS terms (\n"
        for L in term_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        q += "\n);"

    if table=='roles':
        first = 1
        q = "CREATE TABLE IF NOT EXISTS roles (\n"
        for L in role_format:
            (col,type) = (L,'text')
            if not first:
                q += ",\n"
            first = 0
            q += "\t%s %s" % (col,type)
        return q + "\n);"
    if table == 'summary':
        q = """CREATE TABLE "summary_course_user_views" (
                "id"	INTEGER PRIMARY KEY AUTOINCREMENT,
                "courseid"	TEXT,
                "course_canvasid"	TEXT,
                "username"	TEXT,
                "userid"	TEXT,
                "user_canvasid"	TEXT,
                "count"	INTEGER,
                "time_block"	INTEGER )"""
    if q:
        print(q)
        cur.execute(q)
        con.commit()
        return

    if table == 'index':
        for q in [ #'CREATE INDEX "idx_req_userid" ON "requests" ("id","courseid","userid" );',
                'CREATE INDEX "idx_users_id" ON "users" ("id","canvasid"  );',
                'CREATE INDEX "idx_term_id" ON "terms" ("id","canvasid" );',
                'CREATE INDEX "idx_enrollment" ON "enrollment" ("cid","course_id","user_id" );',
                'CREATE INDEX "idx_courses" ON "courses" ("id","canvasid","termid" );' ]:
            #print(q)
            cur.execute(q)
        con.commit()

# Help the next function to upload new users directly to conf database on gavilan.
def employees_refresh_flex(data):
    try:
        data['a'] = 'set/newuser'
        data['sis_user_id'] = data['sis_user_id'][3:]
        print("\nUploading this: \n")
        print(json.dumps(data, indent=2))
        print("\n")
        a = input("Continue (y) or skip (n) ? ")
        if a == 'y':
            # This is what I was missing..........
            # req.add_header("Content-type", "application/x-www-form-urlencoded")
            r3 = requests.post('https://www.gavilan.edu/staff/flex/2020/api.php', params=data)
            print(r3.text)
        #print(r3.headers)
    except Exception as ex:
        print("Failed on: %s\nErr: %s" % (str(data),str(ex)))


# Everyone in iLearn DB with an   xyz@gavilan.edu   email address.
def all_gav_employees():
    (connection,cursor) = db()
    connection.row_factory = dict_factory
    q = """SELECT u.canvasid, u.name, u.created, u.sortablename, h.address, h.type, h.workflow_state,
           h.updated_at, p.last_request_at, p.last_login_at, p.current_login_at, p.last_login_ip,
           p.current_login_ip, p.sis_user_id, p.unique_name FROM users AS u
           JOIN comm_channel AS h ON u.id=h.user_id
           JOIN pseudonym AS p ON p.user_id=u.id
           WHERE h.address LIKE "%@gavilan.edu"
           ORDER BY u.sortablename"""
    cursor = connection.cursor()
    cursor.execute(q)
    everyone = cursor.fetchall()
    everyone_set = set()
    for E in everyone:
        try:
            everyone_set.add(    E['address'].lower()  )
        except Exception as e:
            print("Exception: %s\nwith: %s" % (str(e), str(E)))

    oo = open('cache/temp1.txt','w')
    oo.write(json.dumps(list(everyone_set), indent=2))
    existing = requests.get('https://gavilan.edu/staff/flex/2020/api.php?a=get/users')
    ex = json.loads( existing.text )
    already_enrolled = set()
    for usr in ex['users']:
        try:
            #already_enrolled.add(   (usr['goo'], usr['email'].lower(), usr['name'])  )
            already_enrolled.add(  usr['email'].lower()   )
        except Exception as e:
            print("Exception: %s\nWith: %s" % (str(e),str(usr)))

    oo.write( "\n"*20 + '------------------------------------------\n'*20 + '------ - - - - - - ' )
    oo.write(json.dumps(list(already_enrolled), indent=2))

    # conf_users wants:   goo, email, name, active
    #   and emails have random capitalization
    #   name is First Last,   and sometimes with Middle in there.
    #

    # using sets:     to_enroll = [ x for x in students if x not in already_enrolled ]
    new_emp = [ x for x in everyone_set if x not in already_enrolled ]

    # take the all_employee list,   filter -> anyone who's in 'existing' is removed

    # funcy.where( lambda x: x['email'] == ae[4]   ,  existing )

    #new_emp = list(funcy.filter( lambda ae: funcy.where( existing, email=ae['email'] ),  all_emp ))
    #new_emp = list(funcy.where( existing, email=b'phowell@gavilan.edu')) #ae['email'] ))
    print(new_emp)
    oo.write( "\n"*20 + '------------------------------------------\n'*20 + '------ - - - - - - ' )
    oo.write(json.dumps(list(new_emp), indent=2))

    # Now, iLearn db (everyone)... find the rows that match the email addresses
    # that we've decided we need to add (new_emp)

    #print(everyone)
    #print( "searching for %s" % j )
    #print( "searched for %s, found: %s" % (j, str(to_add) ))
    #print("\nUploading...\n")
    for j in new_emp:
        #j = new_emp[0]
        print(j)
        to_add = list(funcy.where( everyone, address=j ))
        if to_add:
            employees_refresh_flex(to_add[0])
        else:
            print("Didn't find an entry for that account.")
    print("done uploading")

#
def teachers_courses_semester():
    q = """SELECT c.id, c.canvasid AS course_cid, c.name, c.code, u.name, u.sortablename, u.canvasid AS user_cid  FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON u.id=e.user_id
WHERE c.sis LIKE "202070-%"
AND NOT c.state="deleted"
AND e."type"="TeacherEnrollment"
ORDER BY u.sortablename"""
    (connection,cursor) = db()
    cursor.execute(q)
    all_teachers = cursor.fetchall()
    return all_teachers
#
def teachers_by_term():
    q = """SELECT c.id as course_id, c.canvasid as course_c_id, c.name, c.code, c.created as course_created, c.start, c.visible, c.state, e.last_activity,
u.id as user_id, u.canvasid as user_c_id, u.sortablename, u.created as user_created
FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON u.id=e.user_id
WHERE c.sis LIKE "202070%"
AND e."type"="TeacherEnrollment"
ORDER BY c.code"""
    (connection,cursor) = db()
    cursor.execute(q)
    all_teachers = cursor.fetchall()


# Report for AEC
def aec_su20_report():
    global mycourses
    #AE 600 (80040; 80045; 80047)  10945
    #AE 602 (80048; 80049; 80050)  10746
    #AE 636 (80332; 80381)         10783
    #CSIS 571A (80428)             10956
    #GUID 558A (80429)             10957
    import csv

    course_id = "10957"
    course_label = "GUID 558A 80429"

    (connection,cursor) = db()
    sections = "10945 10746 10783 10956 10957".split(" ")

    for course_id in sections:
        if 0:
            for course_id in sections:
                q = """SELECT c.code, u.sortablename, c.id, e.user_id,
                        c.canvasid FROM courses AS c JOIN enrollment AS e ON e.course_id=c.id
                        JOIN users AS u ON u.id=e.user_id
                        WHERE c.canvasid=%s""" % course_id
                cursor.execute(q)

                for row in cursor:
                    print(row)
                    mycourses[row[2]] = ''
            return


        grp_sum_qry = """SELECT u.sortablename, r.timeblock, SUM(r.viewcount), u.canvasid AS user, c.canvasid
            FROM requests_sum1 AS r
            JOIN courses AS c ON r.courseid=c.id
            JOIN enrollment as e ON e.course_id=c.id
            JOIN users AS u ON u.id=r.userid
            WHERE c.canvasid=%s
            GROUP BY r.userid,c.id,r.timeblock
            ORDER BY u.sortablename ,r.timeblock """ % course_id

        cursor.execute( grp_sum_qry )
        with codecs.open("cache/aec_%s.csv" % course_id, "w", "utf-8") as write_file:
            c_out = csv.writer(write_file)
            c_out.writerow( ['name','timeblock','viewcount','timestamp','minutes'] )

            rows = [list(row) for row in cursor]
            print("Got %i records" % len(rows))
            compressed_rows = []

            last_timeblock = -1
            last_R = []
            current_minute = 0
            current_name = ""
            uptodate = 1
            for R in rows:
                print(" %s\t%s " % (R[0], current_name) )
                if R[0] != current_name:
                    if not uptodate:
                        last_R.append(current_minute)
                        last_R.pop(1)
                        last_R.pop(2)
                        last_R.pop(2)
                        compressed_rows.append(last_R)
                    uptodate = 1
                    last_timeblock = -1
                    last_R = []
                    current_minute = 0
                    current_name = R[0]

                if R[2] < 3: continue
                if R[1] != last_timeblock+1 and len(last_R):
                    # non contiguous timeblock, save the last row and reset counters
                    last_timeblock = R[1]

                    R.append( str(dt_from_timeblock( R[1] )) )

                    last_R.append(current_minute)
                    current_minute = 15

                    #last_R.pop(1)
                    last_R.pop(3)
                    last_R.pop(3)

                    compressed_rows.append(last_R)    # makes a copy of list. dunno if thats necessary
                    #print(last_R)

                    last_R = R
                    uptodate = 1
                else:
                    # contiguous or first timeblock
                    current_minute += 15
                    last_timeblock = R[1]
                    if len(last_R):
                        last_R[2] = int(last_R[2]) + int(R[2])  # add the views
                        # its contiguous, so we already have a last_R we're building on
                    else:
                        last_R = R[:]   # clone it.
                    uptodate = 0
            if not uptodate:
                last_R.append(current_minute)
                last_R.pop(1)
                last_R.pop(2)
                last_R.pop(2)
                compressed_rows.append(last_R)


            for R in compressed_rows:
                c_out.writerow(R)

            # Build up a report for everyone
            outfile = codecs.open('cache/positive_attendance_%s.csv' % course_id , 'w', 'utf-8')
            pa_out = csv.writer(outfile)
            pa_out.writerow( ['name','date','viewcount','minutes'] )

            people = funcy.group_by(lambda x: x[0], compressed_rows)
            for P in people:
                if P in ['Ally','Burgman, Lorraine','Horta, Gilbert','Mendez, Frank','Student, Test']:
                    continue
                outrows = [ [P,''] ]
                try:

                    #print(P)
                    #print(people[P])
                    for x in people[P][1:]:
                        outrows.append( [ '', x[3], x[2],x[4] ] )
                    mins = list(map( lambda x: x[4], people[P][1:]))
                    print(mins)
                    total_min = functools.reduce( lambda x, y: int(x)+int(y), mins)
                    outrows.append( ['Total minutes', total_min] )
                    print("Total minutes is %i." % total_min)
                    hours = total_min / 60.0
                    outrows.append( ['Total hours', hours] )
                    print("Total hours is %0.1f." % hours)
                    outrows.append( [] )
                    outrows.append( [] )

                    for x in outrows:
                        print(x)
                        pa_out.writerow(x)
                except Exception as e:
                    print("Some sort of error: %s" % str(e))


    connection.close()
    print("Wrote output file to: %s" % "cache/aec_%s.csv" % course_label)


    """
    HELPERS

    Whos in a course?
    SELECT * FROM enrollment as e JOIN courses AS c ON e.course_id=c.id WHERE c.canvasid=10957    ; AND c.worflow=active

    """


##########
##########
##########    JUST LOADING FROM FILE
##########
##########


######################

# Return the most up do date version of the given file. Useful for 'dimensions'.
def most_recent_file_of( target ):

    def finder(st):
        return re.search(target,st)

    all = os.listdir(local_data_folder)
    all.sort(key=lambda x: os.stat(os.path.join(local_data_folder,x)).st_mtime)
    all.reverse()
    all = list(funcy.filter( finder, all ))

    #print("file list is: " + str(all))
    if not all:
        return ''
    return all[0]

# Given a table schema, parse log file, return a list of dicts. Optionally remove some columns.
def parse_file_with(  file, format, with_gid=0  ):
    if not file: return []
    all_users = []
    for line in gzip.open(local_data_folder + file,'r'):
        line = line.strip()
        line_dict = dict(list(zip(format, line.decode('utf-8').split("\t"))))
        if with_gid: line_dict['globalid'] = line_dict['globalid'].rstrip()

        remove = []
        for k,v in line_dict.items():
            if v == '\\N' or v == b'\\N': remove.append(k)
        for k in remove: line_dict.pop(k, None)
        all_users.append(line_dict)
    return all_users


# I keep my own cache.
# I return a list of the read lines if the log dates in the file are within dates (top of this file), or FALSE
def is_requestfile_interesting(fname):
    global thefiles, thefiles_dat
    #begin_month = ['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06','2020-07']
    #begin_month = ['2020-09','2020-10','2020-08']
    begin_month = ['2021-02','2021-03']

    #AE 600 (80040; 80045; 80047)  10945
    #AE 602 (80048; 80049; 80050)  10746
    #AE 636 (80332; 80381)         10783
    #CSIS 571A (80428)             10956
    #GUID 558A (80429)             10957

    # The AEC sections of interest.
    sections = '10945 10746 1783 10956 10957'.split(' ')
    # Just once, to get the people
    #[ course_enrollment(x) for x in sections ]


    first = {}
    lines = False
    if fname in thefiles_dat:
        f_date = parse(thefiles_dat[fname])
        #print("\t\t+ %s" % str(f_date))
        first = {'year':str(f_date.year), 'month':"%i-%02i" % (f_date.year,f_date.month) }
        #print("\t\t- %s" % str(first))
        #print("\t\t* From: %s (%s)" % (first['month'], thefiles_dat[fname]) )
        print("+ %s" % first['month'])
    else:
        filei = 0
        #thefiles.write(fname + ',')

        g_file = gzip.open(local_data_folder+fname,'r')
        lines = g_file.readlines()

        last = 0
        i = 0
        j = -1
        while not last:
            last = requests_line(lines[i].decode('utf-8'))
            i += 1
        first = 0
        while not first:
            first = requests_line(lines[j].decode('utf-8'))
            j -= 1

        print("- %s" % first['month'])

        thefiles.write(fname + "," + str(first['date']) + ',' + str(last['date']) + '\n')
        thefiles.flush()

    # TODO more robust here
    if first['month'] in begin_month:
        print("++ Using it.")
        if lines: return lines
        return gzip.open(local_data_folder+fname,'r').readlines()
    return False


# This is it:
# e670d58a-25cb-4666-9675-8438615a5a4a	2019-05-18 13:01:03.558	2019	2019-05	2019-05-18	-256911301467799527	94250000000003187	94250000000000001	94250000000000001	\N	\N	\N	\N	/api/v1/courses/3187/assignments?page=4573781&per_page=30	Java/1.8.0_191	GET	35.173.111.106	81639	assignments_api	index	Course	3187	\N	6dad4c59c75a3492b830fb3b1136e1bc	-553092862543029181	200	HTTP/1.1	170000000000376


# TODO - investigate pearson, developer key: 170000000000376 and their ridiculous amounts of hits.
# and all these others:  https://ilearn.gavilan.edu/accounts/1/developer_keys

#from dateutil.relativedelta import relativedelta
#diff = relativedelta(start, ends)

secs_in_a_24hr_block = 60 * 60 * 24    #  24 HOUR BLOCK
secs_in_a_4hr_block = 60 * 60 * 4    #  4 HOUR BLOCK
secs_in_a_block = 60 * 15    #  15 MINUTE BLOCK
start_of_time = '2020-08-23 00:00:00'

# Why is this 7 minutes off?
# start = dt.strptime(start_of_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=pytz.timezone('US/Pacific'))

pst = pytz.timezone('US/Pacific')
start = pst.localize(dt.strptime(start_of_time, '%Y-%m-%d %H:%M:%S'))
start_seconds = start.timestamp()

# epoch slot: imagine time starts on Jan 1 of 20xx, and is counted off in xxxxxxxxxxxxx4 hour slots, so
# xxxxxx time 0 = jan 1, 12am - 3:59am,   time 1 = 4am - 8am,... and so on.
# xxxxxx So there's 6 of these per day.
#
# In this version I'm doing 15 minute slots, 4 per hour, 96 per day.
#
# Return a 'timeblock'. An integer number of 15 minute blocks from my epoch. Expects a datetime object in PST timezone.
def timeblock_from_dt(dt_obj):
    global start, start_seconds
    secs = dt_obj.timestamp() - start_seconds
    return int(  secs / secs_in_a_block )

# Returns a time in PST, given a 'timeblock'. Will be used in translating back to human time
def dt_from_timeblock(tb):
    delta = timedelta(seconds=tb*secs_in_a_block)
    return start + delta

####
# Twenty Four hour timeblocks
def timeblock_24hr_from_dt(dt_obj):
    global start, start_seconds
    secs = dt_obj.timestamp() - start_seconds
    return int(  secs / secs_in_a_24hr_block )

# Returns a time in PST, given a 'timeblock'. Will be used in translating back to human time
def dt_from_24hr_timeblock(tb):
    delta = timedelta(seconds=tb*secs_in_a_24hr_block)
    return start + delta


####
# Four hour timeblocks
def timeblock_4hr_from_dt(dt_obj):
    global start, start_seconds
    secs = dt_obj.timestamp() - start_seconds
    return int(  secs / secs_in_a_4hr_block )

# Returns a time in PST, given a 'timeblock'. Will be used in translating back to human time
def dt_from_4hr_timeblock(tb):
    delta = timedelta(seconds=tb*secs_in_a_4hr_block)
    return start + delta


# I make the line into a dict, erase keys with no data, make a DT field called date, make a time_block (int) field.
def requests_line(line,i=0):
    L = line  # strip?
    if type(L) == type(b'abc'): L = line.decode('utf-8')
    line_parts = L.split("\t")
    for pattern in unwanted_req_paths:
        if pattern in L:
            return 0
    d = dict(list(zip(requests_format, L.split("\t"))))
    remove = []
    for k,v in d.items():
        if v == '\\N' or v == b'\\N': remove.append(k)
    for k in remove: d.pop(k, None)
    d['date'] = dt.strptime( d['timestamp'], "%Y-%m-%d %H:%M:%S.%f" )
    d['date'] = d['date'].replace(tzinfo=pytz.timezone('UTC')).astimezone(pytz.timezone('US/Pacific'))
    d['time_block'] = timeblock_from_dt(d['date'])
    #if i % 1000 == 1: print(d)
    return d

import time

# Take all the requests.gz files and index them in some useful fashion.
# Bulk insert of requests logs. Too much data to be useful.
def requests_file(fname_list):
    global mycourses
    samples = codecs.open('cache/request_samples.txt', 'a', 'utf-8')
    conn,cur = db()

    folderi = 0
    filei = 0
    last_time = time.process_time()

    q = "INSERT INTO requests_sum1 (userid, courseid, timeblock, viewcount) VALUES (?,?,?,?) ON CONFLICT (userid,courseid,timeblock) DO UPDATE SET viewcount=viewcount+1"

    for fname in fname_list:
        #if folderi > 2: return
        print("\n%i\t%s   \t" % (folderi, fname), end='', flush=True)
        folderi += 1
        filei = 0

        lines = is_requestfile_interesting(fname)
        if lines:
            vals_cache = []
            for L in lines:
                thisline = requests_line(L,filei)    #TODO select if timeblock exists
                if not thisline:
                    continue
                if random.random() > 0.9999:
                    #L = str(L)
                    if type(L) == type(b'abc'): L = L.decode('utf-8')
                    parts = L.split('\t')
                    if len(parts)>17:
                        samples.write( "\t".join( [parts[13] , parts[14], parts[15], parts[16], parts[18], parts[19]]))

                #q,v = dict_to_insert(thisline,'requests')
                if not 'courseid' in thisline: continue
                if not 'userid' in thisline: continue

                # Limit this database to certain courses?
                # if thisline['courseid'] not in mycourses: continue

                v = ( thisline['userid'], thisline['courseid'], thisline['time_block'], 1 )
                vals_cache.append(  [ str(x) for x in v ] )
                try:
                    #cur.execute(q)
                    if filei % 5000 == 0:
                        conn.executemany(q, vals_cache)
                        conn.commit()
                        t = time.process_time()
                        delta = t - last_time
                        last_time = t
                        print("\nLoop %i - committed to db in %0.1fs. " % (filei,delta), end='', flush=True)
                        samples.flush()
                    filei += 1
                except Exception as e:
                    print(thisline)
                    print(e)
                    print(q)
                    print(v)
            # do the commit on the entire file...
            conn.executemany(q, vals_cache)
            conn.commit()
            t = time.process_time()
            delta = t - last_time
            last_time = t
            print("\nLoop %i - committed to db in %0.1fs. " % (filei,delta), end='', flush=True)


# Insert or update a request line.
def upsert_request(line, vals):
    # "id userid courseid timeblock viewcount partcount"

    # is it a view or a participation?
    q = "INSERT INTO requests_sum1 (userid, courseid, timeblock, viewcount) VALUES ('%s','%s',%s,%s) ON CONFLICT (userid,courseid,timeblock) DO UPDATE SET viewcount=viewcount+1" % (  str(vals[0]), str(vals[1]), str(vals[2]), str(vals[3]) )
    return q


# Generic insert of a dict into a table. Keys of dict must match table columns.
def dict_to_insert(thisline,table):  # a dict
    vals = []
    v_str = ''
    first = 1
    q = "INSERT INTO %s (" % table

    for k in thisline.keys():
        #print(k)
        if k == 'date': continue
        if not first:
            q += ","
            v_str += ","
        q += k
        v_str += "?"
        vals.append(str(thisline[k]))
        first = 0
    q +=  ") VALUES (" + v_str + ")"
    return q, vals

# This and the following merge functions do direct inserts without further tallying.
# This now does tallying by timeblock.
def merge_requests():
    req = []
    i = 0
    max = 2000

    for f in os.listdir(local_data_folder):
        if re.search(r'requests',f) and i < max:
            req.append(f)
            i += 1
    #req = ['requests-00000-afc834d1.gz',]
    print("Checking %i request log files." % len(req))
    requests_file(req)

def merge_comm_channel():
    setup_table('comm_channel')
    (conn,cur) = db()
    count = 0

    cfile = most_recent_file_of('communication_channel_dim')
    cm = parse_file_with( cfile, communication_channel_dim_format)
    for U in cm:
        q,v = dict_to_insert(U,'comm_channel')
        try:
            cur.execute(q,v)
            count += 1
        except Exception as e:
            print(e)
            print(q)
    conn.commit()
    print("Processed %i comm channel entries" % count)


def merge_pseudonym():
    setup_table('pseudonym')
    (conn,cur) = db()
    count = 0

    cfile = most_recent_file_of('pseudonym_dim')
    cm = parse_file_with( cfile, pseudonym_dim_format)
    for U in cm:
        q,v = dict_to_insert(U,'pseudonym')
        try:
            cur.execute(q,v)
            count += 1
        except Exception as e:
            print(e)
            print(q)
    conn.commit()
    print("Processed %i pseudonym entries" % count)


def merge_users():
    setup_table('users')
    (conn,cur) = db()

    user_file = most_recent_file_of('user_dim')
    users = parse_file_with( user_file, users_format)
    for U in users:
        q,v = dict_to_insert(U,'users')
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()

def merge_courses():
    setup_table('courses')
    (conn,cur) = db()

    c_file = most_recent_file_of('course_dim')
    courses = parse_file_with( c_file, course_format)
    for U in courses:
        q,v = dict_to_insert(U,'courses')
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()

def merge_enrollment():
    setup_table('enrollment')
    (conn,cur) = db()

    c_file = most_recent_file_of('enrollment_dim')
    print("Using enrollments from: %s" % c_file)
    courses = parse_file_with( c_file, enrollment_dim_format)
    count = 0
    for U in courses:
        q,v = dict_to_insert(U,'enrollment')
        count += 1
        #if count % 1000 == 0:
        #    print( "%i - " % count + q + " " + str(v) )
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()
    print("Processed %i enrollments" % count)


def merge_term():
    setup_table('terms')
    (conn,cur) = db()

    c_file = most_recent_file_of('enrollment_term_dim')
    courses = parse_file_with( c_file, term_format)
    for U in courses:
        q,v = dict_to_insert(U,'terms')
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()

def merge_roles():
    (conn,cur) = db()
    cur.execute(setup_table('roles'))
    conn.commit()

    c_file = most_recent_file_of('role_dim')
    courses = parse_file_with( c_file, role_format)
    for U in courses:
        q,v = dict_to_insert(U,'roles')
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()

def merge_convos():
    setup_table('conversation')
    setup_table('conversation_message')

    (conn,cur) = db()
    c_file = most_recent_file_of('conversation_dim')
    ccc = parse_file_with( c_file, conversation_dim_format)
    for U in ccc:
        q,v = dict_to_insert(U,'conversation')
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()

    c_file = most_recent_file_of('conversation_message_dim')
    ccc = parse_file_with( c_file, conversation_message_dim_format)
    for U in ccc:
        q,v = dict_to_insert(U,'conversation_message')
        try:
            cur.execute(q,v)
        except Exception as e:
            print(e)
            print(q)
    conn.commit()

# For returning sqlite results as dicts
def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d


# TODO... approaches to all this data... list requests in order descending time, unique users, and just
# file stats on them...?

# people's maxs, with time block window:
# select *,count(course_canvasid),sum(count),max(time_block),min(time_block) from summary_course_user_views group by username order by min(time_block)

# get the time back:  dt_from_timeblock(11296)


# Attempt to do tallying
def make_views_summarys():
    connection = sqlite3.connect(sqlite_file)
    connection.row_factory = dict_factory
    cursor = connection.cursor()

    q1 = """select courses.id, courses.code, courses.name, courses.visible, courses.state, courses.sis from courses
            join terms on courses.termid=terms.id
            where terms.name="2021 Spring" and courses.state="available";
            """

    cursor.execute(q1)
    sp2020_courses = cursor.fetchall()
    #print(json.dumps(sp2020_courses,indent=2))

    print("Summarizing views...     ", end='')
    for C in sp2020_courses:
        print("%s,  " % C['name'], end='', flush=True)

        #if input('enter to go, q to quit') == 'q': break
        q2 = """select sum(requests_sum1.viewcount) as views, requests_sum1.timeblock as block, courses.code, courses.canvasid as ccid,
            users.name, users.id, users.canvasid from requests_sum1
            join users on users.id = requests_sum1.userid
            join courses on courses.id=requests_sum1.courseid
            where courses.id="%s"
            group by users.name, block """ % C['id']
        cursor.execute(q2)
        views = cursor.fetchall()
        #print(json.dumps(views,indent=2))
        for U in views:
            q3 = """INSERT INTO summary_course_user_views ("courseid","course_canvasid", "username","userid","user_canvasid","count","time_block") VALUES (?,?,?,?,?,?,?);"""
            vals = [C['id'], U['ccid'], U['name'], U['id'], U['canvasid'], U['views'], U['block']]
            #print( q3 )
            #print( vals )
            #print('')
            cursor.execute(q3,vals)
        connection.commit()
    connection.close()

# original without time_blocks info.
def make_views_summarys_v1():
    connection = sqlite3.connect(sqlite_file)
    connection.row_factory = dict_factory
    cursor = connection.cursor()

    q1 = """select courses.id, courses.code, courses.name, courses.visible, courses.state, courses.sis from courses
            join terms on courses.termid=terms.id
            where terms.name="2020 Spring " and courses.state="available";
            """

    cursor.execute(q1)
    sp2020_courses = cursor.fetchall()
    #print(json.dumps(sp2020_courses,indent=2))

    for C in sp2020_courses:
        print("Summarizing views for " + C['name'])

        #if input('enter to go, q to quit') == 'q': break
        q2 = """select count(requests.id) as views, courses.code, courses.canvasid as ccid, users.name, users.id, users.canvasid from requests
            join users on users.id = requests.userid
            join courses on courses.id=requests.courseid
            where requests.courseid="%s"
            group by users.name;""" % C['id']
        cursor.execute(q2)
        views = cursor.fetchall()
        #print(json.dumps(views,indent=2))
        for U in views:
            q3 = """INSERT INTO summary_course_user_views ("courseid","course_canvasid", "username","userid","user_canvasid","count") VALUES (?,?,?,?,?,?);"""
            vals = [C['id'], U['ccid'], U['name'], U['id'], U['canvasid'], U['views'] ]
            print( q3 )
            print( vals )
            print('')
            cursor.execute(q3,vals)
        connection.commit()
    connection.close()


# Setup my basic db stats base from scratch
def full_reload():

    path = "cache/canvas_data/"
    file = "data.db"
    if exists(path + file):
        time = date_time = dt.fromtimestamp( getmtime(path + file) )
        newname = 'data'+ time.strftime('%Y%m%d') + ".db"
        print("renaming old data file to %s" % newname)
        os.rename(path+file, path + newname)

    sync_non_interactive()

    setup_table('requests_sum1')
    setup_table('courses')
    setup_table('users')
    setup_table('roles')
    setup_table('enrollment')
    setup_table('terms')
    setup_table('conversation')
    setup_table('conversation_message')
    setup_table('summary')
    setup_table('index')


    merge_users()
    merge_comm_channel()
    merge_convos()
    merge_courses()
    merge_pseudonym()
    merge_enrollment()
    merge_term()
    merge_roles()

    #merge_requests()

    #make_views_summarys()

def guess_dept(t):
    #print(t)
    method = 1  # crosslisted courses get their own dept
    method = 2  # xlisted takes dept first listed

    if method==1:
        p = "^([A-Z/]+)\d+"
        m = re.search(p, t['code'])
        if m:
            return m.group(1)
        return '?'
    if method==2:
        p = "^([A-Z]+)[\d/]+"
        m = re.search(p, t['code'])
        if m:
            return m.group(1)
        return '?'


# Main view of all class / all user overview...
def dept_with_studentviews(dept="", sem=''):
    if not sem:
        sem = input("which semester? (ex: 2020 Fall) ")

    connection = sqlite3.connect(sqlite_file)
    connection.row_factory = dict_factory
    cursor = connection.cursor()

    q1 = """select courses.id, courses.canvasid, courses.code, courses.name, courses.visible, courses.state, courses.sis from courses
            join terms on courses.termid=terms.id
            where terms.name="%s" and courses.state="available" """ % sem
    if dept:
        q1 += " AND courses.code LIKE '%" + dept + "%';"

    print(q1)
    cursor.execute(q1)
    courses = cursor.fetchall()
    return courses
    #print(json.dumps(sp2020_courses,indent=2))

    # version 1 of this got as high as 208 MB. Removed names, other unused columns.

    qry = "select suv.user_canvasid, suv.courseid, suv.count, suv.time_block, courses.code from summary_course_user_views as suv join courses on courses.id=suv.courseid where suv.courseid=%s"

    if dept == 'all':
        views_records = list( funcy.flatten( [ cursor.execute(qry% x['id']).fetchall() for x in sp2020_courses ] ) )
        by_course = funcy.group_by( lambda x: x['code'], views_records)
        for k,v in by_course.items():
            by_course[k] = funcy.group_by( lambda x: x['user_canvasid'], v)
        return by_course


    def f(x):
        return x['code']
    this_dept = filter( lambda x: guess_dept(x)==dept, sp2020_courses )


    views_records = list( funcy.flatten( [ cursor.execute(qry% x['id']).fetchall() for x in this_dept ] ) )

    return funcy.group_by( lambda x: x['courseid'], views_records)
    return "Couldn't find that department: %s" % dept


def get_courses_in_term_local(term="172"):
    q = """SELECT c.code, c.name, c.state, c.canvasid, c.id  FROM courses AS c JOIN terms AS t ON c.termid=t.id WHERE t.canvasid==%s""" % term
    (connection,cursor) = db()
    cursor.execute(q)
    allrows = cursor.fetchall()
    return allrows

# get student count
def course_student_stats(canvasid):
    q = """SELECT u.name FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON u.id=e.user_id
WHERE c.canvasid=%s
AND e.type="StudentEnrollment"
AND e.workflow="active" """ % (canvasid)
    (connection,cursor) = db()
    cursor.execute(q)
    allrows = cursor.fetchall()
    a = [ len(allrows), ]
    b = []
    for x in allrows: b.append(x[0])
    return [a,b]
    return [x[0] for x in allrows]


# get teacher name from local db
def course_quick_stats(canvasid):
    q = """SELECT c.id AS courseid, c.code, tt.name, c.state, COUNT(u.id) AS student_count  FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON u.id=e.user_id
JOIN (
SELECT c.id AS courseid, u.id AS userid, c.code, u.name  FROM courses AS c
	JOIN enrollment AS e ON e.course_id=c.id
	JOIN users AS u ON u.id=e.user_id
	WHERE c.canvasid=%s
	AND e."type"="TeacherEnrollment"
) AS tt ON c.id=tt.courseid
WHERE c.canvasid=%s
AND e."type"="StudentEnrollment"
GROUP BY c.code
ORDER BY c.code""" % (canvasid,canvasid)
    (connection,cursor) = db()
    cursor.execute(q)
    allrows = cursor.fetchall()
    return allrows


# What a student has taken / teacher has taught
def user_enrolled_in(userid):
    q = """SELECT u.canvasid as user_id, c.canvasid AS course_id, u.name, u.sortablename, c.code, c.name AS course_name, c.sis, t.name, p.current_login_at, p.current_login_ip, p.sis_user_id FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON e.user_id=u.id
JOIN pseudonym AS p ON p.user_id=u.id
JOIN terms AS t ON c.termid=t.id
WHERE u.canvasid=%s ORDER BY t.name ASC""" % userid
#AND e.workflow="active"
#GROUP BY u.canvasid"""               ## AND e."type"="StudentEnrollment"
    (connection,cursor) = db()
    cursor.execute(q)
    return cursor.fetchall()


# All students in this semester ...
def users_this_semester_db(sem=''):
    if not sem:
        sem = input("which semester? (ex: 202150) ")

    q = """SELECT u.canvasid, u.name, u.sortablename, COUNT(e.id) AS num FROM enrollment AS e
JOIN users AS u ON e.user_id=u.id
JOIN courses AS c ON e.course_id=c.id
WHERE c.sis LIKE "%s-%%"
AND e.workflow="active"
GROUP BY u.canvasid"""  % sem             ## AND e."type"="StudentEnrollment"
    (connection,cursor) = db()
    cursor.execute(q)
    all_u = set()
    for u in cursor:
        print(u)
        all_u.add(str(u[0]))
    print("%i users this semester." % len(all_u))
    return all_u


# Everyone whose first semester is .....
def users_new_this_semester(sem=""):
    if not len(sem):
        sem = input("which semester? (ex: 202150) ")
    users_to_enroll = set()

    where1 = "c.sis LIKE '%s-%%'" % sem
    where2 = "c.sis NOT LIKE '%s-%%'" % sem


    q = """SELECT u.canvasid, u.name, u.sortablename, GROUP_CONCAT(c.code), COUNT(e.id) AS num FROM enrollment AS e
JOIN users AS u ON e.user_id=u.id
JOIN courses AS c ON e.course_id=c.id
WHERE %s
AND e.workflow="active"
AND e."type"="StudentEnrollment"
AND u.canvasid NOT IN (
	SELECT u.canvasid FROM enrollment AS e
	JOIN users AS u ON e.user_id=u.id
	JOIN courses AS c ON e.course_id=c.id
	WHERE %s
	AND e.workflow="active"
	AND e."type"="StudentEnrollment"
	GROUP BY u.canvasid
)
GROUP BY u.canvasid
ORDER BY num DESC, u.sortablename""" % (where1,where2)


    (connection,cursor) = db()
    cursor.execute(q)
    #s = cursor.fetchall()
    #if s:
    for u in cursor:
        users_to_enroll.add(str(u[0]))
        #print(s)
    print("%i new users this semester." % len(users_to_enroll))
    return users_to_enroll


# All student users in STEM - from local db
def user_in_stem():
    enrolled = set()
    q = """SELECT c.id, c.canvasid, c.name, c.code, c.start, c.visible, c.state,
u.id AS userid, u.canvasid AS user_c_id, u.sortablename FROM courses AS c
JOIN enrollment AS e ON c.id=e.course_id
JOIN users AS u ON u.id=e.user_id
WHERE c.canvasid="11015" AND e."type"="StudentEnrollment"
AND e."workflow"='active'
ORDER BY c.code, u.sortablename """
    (connection,cursor) = db()
    cursor.execute(q)
    results = cursor.fetchall()
    for u in results:
        enrolled.add( (u[9], u[8] ) )
    return enrolled


# Get all the classes in one dept
def dept_classes(dept,sem=''):
    if not sem:
        sem = input("which semester? (ex: 202150) ")


    q = """SELECT c.id, c.canvasid, c.name, c.code, c.start, c.visible, c.state,
u.id AS userid, u.canvasid AS user_c_id, u.sortablename FROM courses AS c
JOIN enrollment AS e ON c.id=e.course_id
JOIN users AS u ON u.id=e.user_id
WHERE c.name LIKE """ + '"' + dept + """%" AND c.sis LIKE """ + '"' + sem + """%" AND e."type"="StudentEnrollment"
AND e."workflow"='active'
ORDER BY c.code, u.sortablename """

    users = set()
    (connection,cursor) = db()

    cursor.execute(q)
    results = cursor.fetchall()
    for u in results:
        users.add(  (u[9], u[8]) )
    return users


# TODO
#
# depts -> courses -> count students... 1 structure...     display as 1 grid?  treeview?
# afterwards: views by student / by week, row of grids per class...

def depts_with_classcounts(sem=''):
    if not sem:
        sem = input("which semester? (ex: 202150) ")

    # This is messier cause i don't have depts in database
    # should I add that? Or just use python. TODO

    q = """select users.canvasid, courses.code, courses.id, users.name, roles.name as role,
        enrollment.workflow as user_status, courses.state as course_state
        from courses join terms on courses.termid = terms.id
        join enrollment on enrollment.course_id=courses.id
        join users on enrollment.user_id = users.id
        join roles on roles.id=enrollment.role
        where terms.sis='%s' and enrollment.workflow='active'
        order by courses.code"""  %   sem

    connection = sqlite3.connect(sqlite_file)
    connection.row_factory = dict_factory
    cursor = connection.cursor()
    cursor.execute(q)
    results = cursor.fetchall()
    connection.close()

    def f(x):
        return x['code']
    by_dept_ = funcy.group_by( guess_dept, results )
    by_dept = {}

    def name_with_count(name,li):
        count = len(li)
        return (name,count,li[0]['id'])

    for d,li in by_dept_.items():
        classes_in_dept = funcy.group_by( f, li )
        #print(classes_in_dept)
        by_dept[d] = [ name_with_count(c,v) for c,v in classes_in_dept.items() ]


    return by_dept

def arrange_data_for_web(dept='', sem=''):
    if not sem:
        sem = input("which semester? (ex: 202150) ")

    # I want:
    # - structure of dicts, 1 file per class
    # - class -> teacher [ teacher1_cid: {name:nnn, week1:hits,week2:hits...],
    #            student [stuent1_cid: {name:nnn, week1:hits,week2:hits...],
    #

    q = "select * from courses join terms on courses.termid = terms.id where terms.sis='%s' and courses.state='claimed'" % sem

    # three... seconds:

    q2 = """select courses.code, users.name, roles.name as role from courses join terms on courses.termid = terms.id
        join enrollment on enrollment.course_id=courses.id
        join users on enrollment.user_id = users.id
        join roles on roles.id=enrollment.role
        where terms.sis='%s' and courses.state='claimed'
        order by code, role """ % sem


    # courses with users - need it as hierarchy - by course, or by user... or 1 user... (with logs...?)

    q3 = """select users.canvasid, courses.code, users.name, roles.name as role,
        enrollment.workflow as user_status, courses.state as course_state
        from courses
        join terms on courses.termid = terms.id
        join enrollment on enrollment.course_id=courses.id
        join users on enrollment.user_id = users.id
        join roles on roles.id=enrollment.role
        where terms.sis='%s'
        order by courses.code""" % sem


    connection = sqlite3.connect(sqlite_file)
    connection.row_factory = dict_factory
    cursor = connection.cursor()


    cursor.execute(q3)

    # fetch all or one we'll go for all.
    results = cursor.fetchall()
    #print(results)
    connection.close()

    def f(x):
        return x['code']
    by_dept_ = funcy.group_by( guess_dept, results )
    by_dept = {}

    for d,li in by_dept_.items():
        by_dept[d] = funcy.group_by( f, li )
    #by_course = funcy.group_by( f, results )
    #return by_course
    #print(json.dumps(by_dept,indent=2))
    if not dept:
        return by_dept  # list(by_dept.keys())

    if dept in by_dept:
        return by_dept[d]
    return "Error"

#
#
#
#
#
#
#
# This csv loading code isn't really necessary cause i get it all from the canvas_data files.
# Except that the enrollments don't seem to be there so this works.
#
# Saved to mine in the future.....

# Get enrollments. (Best to freshly run pipelines/get_rosters) and put them into DB
def build_tables(headers,name):
    first = 1
    query = "CREATE TABLE IF NOT EXISTS %s (\n" % name
    for L in headers:
        if not first:
            query += ",\n"
        first = 0
        query += "\t%s %s" % (L,"text")
    return query + "\n);"

def load_tables(table,headers,row,verbose=0):
    (conn,cur) = db()
    vals = []
    v_str = ''
    i = 0
    q = "INSERT INTO %s (" % table
    for L in headers:
        if i:
            q += ","
            v_str += ","
        q += L
        v_str += "?"
        vals.append(str(row[i]))
        i += 1
    q +=  ") VALUES (" + v_str + ")"
    try:
        cur.execute(q,vals)
        if verbose:
            print(q)
            print(vals)
    except Exception as e:
        print(e)
        print(q)
    conn.commit()

def semester_enrollments(verbose=0):
    def qstrip(txt): return txt.strip('"')

    epath = "cache/rosters/enrollments-2020-08-02-19-49-36.csv"
    #cpath = "cache/rosters/spring2020/courses.2020-02-25T15-57.csv"
    #upath = "cache/rosters/spring2020/users.2020-02-25T15-57.csv"

    enrollments =  [ list( map( qstrip, L.strip().split(','))) for L in open(epath,'r').readlines() ]
    #classes = [ list( map( qstrip, L.strip().split(','))) for L in open(cpath,'r').readlines() ]
    #users = [ list( map( qstrip, L.strip().split(','))) for L in open(upath,'r').readlines() ]

    e = build_tables(enrollments[0],"enrollments")
    #c = build_tables(classes[0],"classes")
    #u = build_tables(users[0],"users")

    if verbose:
        #for x in [e,c,u]: print(x)
        print(enrollments[0])
        print(enrollments[5])
        #print(classes[0])
        #print(classes[5])
        #print(users[0])
        #print(users[5])

    (conn,cur) = db()
    q = e
    try:
        cur.execute(q)
        if verbose: print(q)
    except Exception as ex:
        print(ex)
        print(q)
    conn.commit()

    headers = enrollments[0]
    rows = enrollments[1:]
    # Probably don't want to commit on every row?
    for row in rows:
        load_tables("enrollments",headers,row,verbose)

# Show this as a big grid? D3? CSV?

# Ultimately we need session calcs too. When we get 15 minute chunks, then just add them up....

# Overview of student hits in a course. Return a (pandas??) table student/timeblock/hits  6 * 7 * 7 items per student.


def comm_channel_file():
    """all = os.listdir(local_data_folder)
    all.sort(key=lambda x: os.stat(os.path.join(local_data_folder,x)).st_mtime)
    all.reverse()
    #print "sorted file list:"
    #print all
    for F in all:
        if re.search('communication_channel_dim',F):
            user_file = F
            break
    print("most recent comm channel file is " + user_file)"""


    user_file = most_recent_file_of('communication_channel_dim')

    all_commchannels = []
    for line in gzip.open(local_data_folder+user_file,'r'):
        line_dict = dict(list(zip(communication_channel_dim_format, line.split("\t"))))
        line_dict['globalid'] = line_dict['globalid'].rstrip()
        all_commchannels.append(line_dict)
    df = pd.DataFrame(all_commchannels)
    return df

def pseudonym_file():
    all = os.listdir(local_data_folder)
    all.sort(key=lambda x: os.stat(os.path.join(local_data_folder,x)).st_mtime)
    all.reverse()
    #print "sorted file list:"
    #print all
    for F in all:
        if re.search('pseudonym_dim',F):
            p_file = F
            break
    print("most recent pseudonym file is " + p_file)
    all_users = []
    for line in gzip.open(local_data_folder + p_file,'r'):
        line_dict = dict(list(zip(pseudonym_dim_format, line.split("\t"))))
        line_dict['authentication_provider_id'] = line_dict['authentication_provider_id'].rstrip()
        all_users.append(line_dict)
    df = pd.DataFrame(all_users)
    return df


def abcd():
    setup_table('index')


def crns_to_teachers():
    semester = '202070'
    (connection,cursor) = db()
    emails = set()
    crns = codecs.open('cache/eval_teachers_2020fa.txt','r').readlines()
    q = """SELECT c.id, c.canvasid AS course_cid, c.name, c.code, u.name, u.sortablename, u.canvasid AS user_cid, c.sis, h.address  FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON u.id=e.user_id
JOIN comm_channel AS h ON u.id=h.user_id
WHERE h."type"="email"
AND c.sis = "%s-%s"
AND NOT c.state="deleted"
AND e."type"="TeacherEnrollment"
GROUP BY h.address;"""
    for c in crns:
        c = c.strip()
        print(c)
        cursor.execute(q % (semester,c))
        r = cursor.fetchall()
        for inst in r:
            emails.add(inst[8])
            print(inst)
    open('cache/eval_emails.txt','w').write( ';'.join(emails))
    return emails


def all_sem_courses_teachers():
    q = """SELECT c.id, c.canvasid AS course_cid, c.name, c.code, u.name, u.sortablename, u.canvasid AS user_cid, p.sis_user_id  FROM courses AS c
JOIN enrollment AS e ON e.course_id=c.id
JOIN users AS u ON u.id=e.user_id
JOIN pseudonym AS p ON p.user_id=u.id
WHERE c.sis LIKE "202170-%"
AND NOT c.state="deleted"
AND e."type"="TeacherEnrollment"
ORDER BY u.sortablename;"""
    (connection,cursor) = db()
    cursor.execute(q)
    courses = cursor.fetchall()
    #print(courses)
    return courses


def to_sis_sem(s):
    season = s[0:2]
    year = "20" + s[2:5]
    a = {'sp':'30','su':'50','fa':'70'}
    season = a[season]
    return year+season

def build_db_schedule():
    # from the schedule json files
    target = "\_sched\_expanded\.json"
    def finder(st):
        return re.search(target,st)

    fields = 'sem,sem_sis,crn,dept,num,gp,dean,code,name,teacher,type,cap,act,loc,site,date,days,time,cred,ztc,partofday'.split(',')
    fff = codecs.open('cache/schedule_db_version.sql', 'w', 'utf-8')
    fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, sem_sis text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text, partofday text);\n")
    all = os.listdir('cache/')
    all = list(funcy.filter( finder, all ))
    all.sort()
    for F in all:
        print("\n\n" + F)
        sched = json.loads(codecs.open('cache/'+F,'r','utf-8').read())
        for S in sched:
            parts = S['code'].split(' ')
            S['dept'] = parts[0]
            S['num'] = parts[1]
            S['gp'] = gp[parts[0]]
            S['dean'] = dean[parts[0]]
            S['sem'] = F[0:4]
            S['sem_sis'] = to_sis_sem(F[0:4])
            if not 'partofday' in S:
                S['partofday'] = ''
            str = "INSERT INTO schedule (sem,sem_sis,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc,partofday) VALUES (%s);\n" % \
                  ", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
            #print(str)
            fff.write(str)

def process_enrollment_data():

    sem_index = {'201830':0, '201850':1, '201870':2, '201930':3, '201950':4, '201970':5, '202030':6, '202050':7, '202070':8, '202130':9, '202150':10, '202170':11, '202230':12, '202250':13, '202270':14, '202330':15}

    def sem_to_idx(s):
        return sem_index[str(s)]

    p = pd.read_csv('cache/20221207_all_enrollments_by_student.csv')
    p = p.fillna('')
    p['sem_idx'] = p['sem'].map(sem_to_idx)
    print(p)
    print(sorted(p['sem'].unique()) )
    print(sorted(p['mode'].unique()) )
    print(sorted(p['site'].unique()) )
    print(sorted(p['partofday'].unique()) )
    print(sorted(p['days'].unique()) )
    print(sorted(p['dept'].unique()) )
    print(sorted(p['num'].unique()) )
    print(len(p['num'].unique()) )

    print("I see this many student/semester rows: ", len(p))

    #q = p.groupby(["canvasid","sem"])

    q = p.groupby(["canvasid"])
    print("I see this many students: ", len(q))
    #print(q.size())
    r = pd.DataFrame(q.size())
    print("Summary of course counts")
    print(r.iloc[:,0].value_counts())

    out = codecs.open('cache/20221207_all_enrollments_by_student_with_sems.csv','w','utf-8')
    out.write('"canvasid","sem","mode","site","partofday","days","dept","num","cred","sem_idx","local_sem_idx"\n')

    # convert absolute semester to sequence,
    # ie: student's 1st, 2nd, 3rd, etc
    for name,group in q:
        # drop students with only a single semester -- no predictive value here
        if len(group['sem_idx'].unique())<2:
            continue
        mn = group['sem_idx'].min()
        group.loc[:,'local_sem_idx'] = group['sem_idx'].map(lambda x: x - mn)
        out.write(group.to_csv(index=False, header=False))

    s = p.groupby(by="sem")
    #print("I see this many semesters: ", len(s))
    #print(s.size())


    # todo

def do_encoding():
    # one shot encoding of each field

    modes = {'hybrid':[0,0,0,1], 'in-person':[1,0,0,0], 'online':[0,1,0,0], 'online live':[0,0,1,0]}
    sites = {'Coyote Valley':[1,0,0,0,0,0], 'Gilroy':[0,1,0,0,0,0], 'Hollister':[0,0,1,0,0,0], 'Morgan Hill':[0,0,0,1,0,0], 'Online':[0,0,0,0,1,0], 'Other':[0,0,0,0,0,0], 'San Martin Airport':[0,0,0,0,0,1], 'TBA':[0,0,0,0,0,0]}
    times = {'':[0,0,0,0,0], 'Afternoon':[0,0,1,0,0], 'Evening':[0,0,0,1,0], 'Midday':[0,1,0,0,0], 'Morning':[1,0,0,0,0]}
    days  = {'':[0,0,0,0,0,0], 'F':[0,0,0,0,1,0], 'FS':[0,0,0,0,1,1], 'M':[1,0,0,0,0,0], 'MF':[1,0,0,0,1,0], 'MR':[1,0,0,1,0,0], 'MT':[1,1,0,0,0,0], 'MTR':[1,1,0,1,0,0], 'MTRF':[1,1,0,1,1,0], 'MTW':[1,1,1,0,0,0], 'MTWF':[1,1,0,1,1,0], 'MTWR':[1,1,1,1,0,0], 'MTWRF':[1,1,1,1,1,0], 'MW':[1,0,1,0,0,0], 'MWF':[1,0,1,0,1,0], 'MWR':[1,0,1,1,0,0], 'R':[0,0,0,1,0,0], 'RF':[0,0,0,1,1,0], 'S':[0,0,0,0,0,1], 'T':[0,1,0,0,0,0], 'TBA':[0,0,0,0,0,0], 'TF':[0,1,0,0,1,0], 'TR':[0,1,0,1,0,0], 'TRF':[0,1,0,1,1,0], 'TW':[0,1,1,0,0,0], 'TWR':[0,1,1,1,0,0], 'TWRF':[0,1,1,1,1,0], 'U':[0,0,0,0,0,0], 'W':[0,0,1,0,0,0], 'WF':[0,0,1,0,1,0], 'WR':[0,0,1,1,0,0]}

    deptslist = ['ACCT', 'AE', 'AH', 'AJ', 'AMT', 'ANTH', 'APE', 'ART', 'ASTR', 'ATH', 'BIO', 'BOT', 'BUS', 'CD', 'CHEM', 'CMGT', 'CMUN', 'COS', 'CSIS', 'CWE', 'DM', 'ECOL', 'ECON', 'ENGL', 'ENGR', 'ENVS', 'ESL', 'ETHN', 'FRNH', 'GEOG', 'GEOL', 'GUID', 'HE', 'HIST', 'HTM', 'HUM', 'HVAC', 'JFT', 'JLE', 'JOUR', 'JPN', 'KIN', 'LIB', 'LIFE', 'MATH', 'MCTV', 'MGMT', 'MUS', 'PHIL', 'PHYS', 'POLS', 'PSCI', 'PSYC', 'RE', 'SJS', 'SOC', 'SPAN', 'THEA', 'WELD', 'WTRM']
    d_len = len(deptslist)
    d_template = [ 0 for i in range(d_len) ]
    depts = {}
    for i in range(d_len):
        depts[  deptslist[i] ] = d_template.copy()
        depts[  deptslist[i] ][i] = 1

    numslist =  ['1', '10', '100', '100A', '101', '102', '103', '104', '105', '107', '107A', '109', '10A', '10B', '11', '110', '111', '112', '113', '114', '118', '119', '11A', '11B', '11C', '12', '120', '121', '122', '124', '126', '128', '129', '12A', '12B', '12L', '13', '130', '131', '132', '133', '135', '13A', '13B', '13C', '13D', '14', '140', '142', '143', '144', '14A', '14B', '15', '150', '152', '154', '156', '157', '158', '15A', '15B', '16', '160', '162', '164', '166', '16A', '16B', '16C', '17', '171', '173', '175', '176', '178', '179', '17A', '17B', '17C', '18', '180', '181', '182', '183', '184', '186', '187', '189', '18A', '18B', '19', '190', '191A', '192', '19A', '19B', '19C', '1A', '1B', '1C', '1L', '2', '20', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '20A', '20B', '20C', '21', '210', '211', '212', '213', '213A', '214', '215', '216', '217', '218', '219', '22', '220', '221', '223', '225', '226', '227', '228', '229', '229A', '23', '230', '231', '231A', '232', '233', '235', '236', '24', '240', '242', '24A', '24B', '24C', '24D', '25', '250', '25A', '25B', '26', '260', '27', '270', '28', '280', '281', '282', '283', '290', '291A', '2A', '2B', '2C', '2F', '2J', '2L', '3', '301', '30A', '30B', '32', '33A', '33B', '33C', '34', '34A', '34B', '35', '36', '37', '38', '3A', '3B', '3C', '3D', '4', '40', '400', '402', '41', '411', '412', '412A', '412B', '413', '414', '415', '416', '42', '420', '43', '430', '44', '440', '44A', '44B', '44C', '45', '46', '47', '48', '4A', '4B', '4C', '5', '51', '52', '527', '528', '53', '530', '531', '534', '535', '536', '537', '538', '539', '54', '541', '542', '543', '547', '548', '549', '54L', '55', '550', '552', '553', '554', '557', '558A', '56', '560', '562', '563', '564', '565', '569', '570A', '570B', '571A', '571B', '571C', '575', '5A', '5B', '6', '60', '600', '601', '602', '603', '61A', '61B', '61C', '62A', '62B', '62C', '636', '638', '64A', '64B', '64C', '64D', '65', '66A', '66B', '66C', '68A', '68B', '68C', '7', '700', '701', '702A', '702B', '703', '704A', '705', '706', '707', '709', '70A', '71', '710', '71A', '71B', '727', '728', '73', '731', '732', '737', '738', '74', '740', '741', '742', '743', '744', '746', '747', '748', '749', '74A', '74B', '75', '752', '753', '754', '756', '76', '762', '763', '764', '77', '775', '776', '78', '784', '785', '786', '787', '788', '789', '79', '793', '7A', '7B', '7C', '8', '80', '81A', '81C', '83', '83A', '84', '85', '88A', '88B', '8A', '8B', '8C', '9', '90', '91A', '91B', '92', '97', '9A', '9B']
    n_len = len(numslist)
    n_template = [ 0 for i in range(n_len) ]
    nums = {}
    for i in range(n_len):
        nums[  numslist[i] ] = n_template.copy()
        nums[  numslist[i] ][i] = 1

    return [modes,sites,times,days,depts,nums]

    for x in [modes,sites,times,days,depts,nums]:
        print('var')
        for k,v in x.items():
            print("\t",k,":",v)


if __name__ == "__main__":

    print ('')
    options = {
            1: ['Read and join communications channels.', merge_comm_channel],
            2: ['Read and join users files.', merge_users ],
            3: ['Read and join courses files.', merge_courses ],
            4: ['Read and join enrollment files.', merge_enrollment ],
            5: ['Read and join terms files.', merge_term ],
            6: ['Read and join roles files.', merge_roles ],
            7: ['Read and join conversation files', merge_convos],
            8: ['Read all courses', semester_enrollments],
            9: ['Load requests files. Merge into 15min blocks.', merge_requests ],
            10: ['Full reload. Rename current db.', full_reload],
            11: ['test setup index', abcd],
            12: ['Test web version of data files (json)', make_views_summarys],   #depts_with_classcounts],   # arrange_data_for_web],
            13: ['Test web version of data files (?)', depts_with_classcounts],
            14: ['Student views, classes in 1 dept', dept_with_studentviews],
            15: ['AEC report positive attendance', aec_su20_report],
            16: ['Create list of all employees', all_gav_employees],
            17: ['List emails of evaluated instructors this semester', crns_to_teachers],
            18: ['Fetch this semester shells with teachers', all_sem_courses_teachers],
            19: ['Build DB schedule from json files', build_db_schedule],
            20: ['Process enrollment data', process_enrollment_data],
            21: ['Encode data', do_encoding],
            #19: ['add evals for a whole semester', instructor_list_to_activate_evals],
            #16: ['Upload new employees to flex app', employees_refresh_flex],
            }

    if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
        resp = int(sys.argv[1])
        print("\n\nPerforming: %s\n\n" % options[resp][0])

    else:
        print ('')
        for key in options:
            print(str(key) + '.\t' + options[key][0])

        print('')
        resp = input('Choose: ')

    # Call the function in the options dict
    options[ int(resp)][1]()