canvasapp/pipelines.py


from time import strptime
from bs4 import BeautifulSoup as bs
from util import UnicodeDictReader
from datetime import datetime as dt
from dateutil import parser
import pandas as pd
import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
import pdb
from collections import defaultdict
from deepdiff import DeepDiff
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, GOO, GOO_PIN, token, url, domain, account_id, header, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key
"""
Everything to do with fetching data:
- From iLearn, via API token
- Current roster uploads from Instructure's SFTP site
- Raw logs and other files from the Canvas Data repo
- From SSB, using Firefox (Selenium) to scrape the schedule
And some subsequent processing:
- Raw roster files into a more compact JSON format
- Raw logs into something more useful
"""
verbose = False
users = {}
users_by_id = {}
# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
schedfile = 'temp.csv'
# Superseded semester settings, kept for reference (only the last block below is active):
# SEMESTER = 'Summer 2019';  short_sem = 'su19'; semester_begin = strptime('06/17', '%m/%d'); filename = 'su19_sched.json'
# SEMESTER = 'Summer 2020';  short_sem = 'su20'; semester_begin = strptime('06/15', '%m/%d'); filename = 'su20_sched.json'
# SEMESTER = 'Fall 2020';    short_sem = 'fa20'; semester_begin = strptime('08/24', '%m/%d'); filename = 'fa20_sched.json'
# SEMESTER = 'Spring 2021';  short_sem = 'sp21'; semester_begin = strptime('02/01', '%m/%d'); filename = 'sp21_sched.json'; filename_html = 'sp21_sched.html'
# Active semester configuration (scrape_schedule_multi() and scrape_for_db() override these at runtime):
SEMESTER = 'Summer 2021 (View only)'
short_sem = 'su21'
semester_begin = strptime('06/14', '%m/%d')
filename = 'su21_sched.json'
filename_html = 'su21_sched.html'
# Current or upcoming semester is first.
sems = ['su21', 'sp21', 'fa20', 'su20', 'sp20'] #, 'fa19'] # 'sp19']
sys.setrecursionlimit( 100000 )
local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
gp = {}
gp['ACCT'] = 'info'
gp['AE'] = 'skill'
gp['AH'] = 'well'
gp['AJ'] = 'skill'
gp['AMT'] = 'skill'
gp['ANTH'] = 'soc'
gp['APE'] = 'skill'
gp['ART'] = 'art'
gp['ASTR'] = 'stem'
gp['ATH'] = 'well'
gp['BIO'] = 'stem'
gp['BOT'] = 'info'
gp['BUS'] = 'info'
gp['CD'] = 'skill'
gp['CHEM'] = 'stem'
gp['CMGT'] = 'skill'
gp['CMUN'] = 'comm'
gp['COS'] = 'skill'
gp['CSIS'] = 'stem'
gp['CWE'] = 'skill'
gp['DM'] = 'info'
gp['ECOL'] = 'stem'
gp['ECON'] = 'info'
gp['ENGL'] = 'soc'
gp['ENGR'] = 'stem'
gp['ENVS'] = 'stem'
gp['ESL'] = 'comm'
gp['ETHN'] = 'comm'
gp['FRNH'] = 'comm'
gp['GEOG'] = 'stem'
gp['GEOL'] = 'stem'
gp['GUID'] = 'soc'
gp['HE'] = 'well'
gp['HIST'] = 'soc'
gp['HUM'] = 'soc'
gp['HVAC'] = 'skill'
gp['JFT'] = 'skill'
gp['JLE'] = 'skill'
gp['JOUR'] = 'comm'
gp['JPN'] = 'comm'
gp['KIN'] = 'well'
gp['LIB'] = 'comm'
gp['LIFE'] = 'well'
gp['MATH'] = 'stem'
gp['MCTV'] = 'art'
gp['MUS'] = 'art'
gp['PHIL'] = 'soc'
gp['PHYS'] = 'stem'
gp['POLS'] = 'soc'
gp['PSCI'] = 'stem'
gp['PSYC'] = 'soc'
gp['RE'] = 'skill'
gp['SJS'] = 'soc'
gp['SOC'] = 'soc'
gp['SPAN'] = 'comm'
gp['THEA'] = 'art'
gp['WELD'] = 'skill'
gp['WTRM'] = 'skill'
gp['MGMT'] = 'skill'
gp['MKTG'] = 'skill'
gp['HTM'] = 'skill'
dean = {}
dean['AH'] = 'et'
dean['HE'] = 'et'
dean['ATH'] = 'et'
dean['KIN'] = 'et'
dean['LIFE'] = 'et'
dean['AE'] = 'ss'
dean['APE'] = 'ss'
dean['ACCT'] = 'ss'
dean['AJ'] = 'ss'
dean['AMT'] = 'ss'
dean['HVAC'] = 'ss'
dean['JFT'] = 'ss'
dean['JLE'] = 'ss'
dean['RE'] = 'ss'
dean['WTRM'] = 'ss'
dean['WELD'] = 'ss'
dean['ANTH'] = 'nl'
dean['ART'] = 'nl'
dean['ASTR'] = 'jn'
dean['BIO'] = 'jn'
dean['BOT'] = 'ss'
dean['BUS'] = 'ss'
dean['CD'] = 'ss'
dean['CHEM'] = 'jn'
dean['CMGT'] = 'ss'
dean['CMUN'] = 'nl'
dean['COS'] = 'ss'
dean['CSIS'] = 'ss'
dean['CWE'] = 'ss'
dean['DM'] = 'ss'
dean['ECOL'] = 'jn'
dean['ECON'] = 'nl'
dean['ENGL'] = 'nl'
dean['ENGR'] = 'jn'
dean['ENVS'] = 'jn'
dean['ESL'] = 'ss'
dean['ETHN'] = 'nl'
dean['FRNH'] = 'nl'
dean['GEOG'] = 'jn'
dean['GEOL'] = 'jn'
dean['GUID'] = 'nl'
dean['HIST'] = 'nl'
dean['HUM'] = 'nl'
dean['JOUR'] = 'nl'
dean['JPN'] = 'nl'
dean['LIB'] = 'kn'
dean['MATH'] = 'jn'
dean['MCTV'] = 'nl'
dean['MGMT'] = 'ss'
dean['MKTG'] = 'ss'
dean['HTM'] = 'ss'
dean['MUS'] = 'nl'
dean['PHIL'] = 'nl'
dean['PHYS'] = 'jn'
dean['POLS'] = 'nl'
dean['PSCI'] = 'jn'
dean['PSYC'] = 'nl'
dean['SJS'] = 'nl'
dean['SOC'] = 'nl'
dean['SPAN'] = 'nl'
dean['THEA'] = 'nl'
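# Example (sketch): these lookup tables tag each section downstream, e.g.
#   gp['MATH']   -> 'stem'
#   dean['MATH'] -> 'jn'
# scrape_schedule_multi() and scrape_for_db() use them to add 'gp' and 'dean'
# columns to the schedule output; a department missing from either dict will
# raise a KeyError there.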
class FetchError(Exception):
pass
DEBUG = 0
def d(s,end=''):
global DEBUG
if end and DEBUG: print(s,end=end)
elif DEBUG: print(s)
################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################
# Main Canvas API querying function
def fetch(target,verbose=0):
# if there are more results, recursively call myself, adding on to the results.
results = 0
count = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
#link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
#if (verbose): print("++ More link: " + link)
nest = fetch(link,verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
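# Example usage (a sketch -- assumes `url`, `header`, and `account_id` from
# canvas_secrets point at your Canvas instance):
#   courses = fetch('/api/v1/accounts/%s/courses?per_page=100' % account_id, verbose=1)
#   for c in courses: print(c['id'], c['name'])
# Relative targets get the base `url` prepended; paginated responses are followed
# via the 'next' link and merged into one list (or dict) before returning.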
# Main Canvas querying function - streaming (generator) version - doesn't die on big requests
def fetch_stream(target,verbose=0):
# keep following the 'next' paging link, yielding one page of results at a time.
results = 0
count = 0
while target:
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
if r2.status_code == 502:
raise FetchError()
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
next_link_found = 0
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
target = link
next_link_found = 1
break
if not next_link_found: target = 0
yield results
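# Example usage (sketch): fetch_stream() is a generator, so large collections can
# be processed one page at a time instead of building one huge list in memory:
#   for page in fetch_stream('/api/v1/accounts/%s/users?per_page=100' % account_id):
#       for u in page: print(u['id'])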
# For responses wrapped in a single key, collapse that key out, because
# paging makes problems otherwise... example: enrollment_terms
def fetch_collapse(target,collapse='',verbose=0):
# if there are more results, recursively call myself, adding on to the results.
results = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
except:
print("-- Failed to parse: ", r2.text)
if verbose: print(r2.headers)
if collapse and collapse in results:
results = results[collapse]
if ('link' in r2.headers):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
nest = fetch_collapse(link, collapse, verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
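# Example usage (sketch), for endpoints that wrap their list in a single key:
#   terms = fetch_collapse('/api/v1/accounts/%s/terms' % account_id, collapse='enrollment_terms')
# The terms endpoint returns {"enrollment_terms": [...]}; collapsing that key lets
# paging concatenate the lists correctly.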
################
################ SCHEDULE PARSING HELPERS
################
################
################
# Teacher name format changed. Remove commas and switch 'Last, First' to 'First Last'
def fix_t_name(str):
str = str.strip()
str = re.sub(r'\s+',' ',str)
parts = str.split(', ')
if len(parts)>1:
return parts[1].strip() + " " + parts[0].strip()
return str
# Separate dept and code
def split_class_dept(c):
return c.split(' ')[0]
def split_class_code(c):
num = c.split(' ')[1]
parts = re.match(r'(\d+)([a-zA-Z]+)',num)
#ret = "Got %s, " % c
if parts:
r = int(parts.group(1))
#print(ret + "returning %i." % r)
return r
#print(ret + "returning %s." % num)
return int(num)
def split_class_code_letter(c):
num = c.split(' ')[1]
parts = re.match(r'(\d+)([A-Za-z]+)',num)
if parts:
return parts.group(2)
return ''
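# Examples (deduced from the regex above):
#   split_class_dept('MATH 233A')        -> 'MATH'
#   split_class_code('MATH 233A')        -> 233
#   split_class_code_letter('MATH 233A') -> 'A'
#   split_class_code_letter('MATH 233')  -> ''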
# go from sp20 to 2020spring
def shortToLongSem(s):
parts = re.search(r'(\w\w)(\d\d)', s)
yr = parts.group(2)
season = parts.group(1)
seasons = {'sp':'spring','su':'summer','fa':'fall','wi':'winter'}
return '20'+yr+seasons[season]
# Go to the semesters folder and read the schedule. Return dataframe
def getSemesterSchedule(short='sp21'): # I used to be current_schedule
# todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again
filename = 'cache/semesters/'+shortToLongSem(short)+'/' + short + '_sched.json'
print("opening %s" % filename)
#openfile = open(filename,'r')
#a = json.loads(openfile)
#return pd.DataFrame(a)
schedule = pd.read_json(filename)
schedule.teacher = schedule['teacher'].apply(fix_t_name)
#print schedule['teacher']
for index,r in schedule.iterrows():
tch = r['teacher']
parts = tch.split(' . ')
if len(parts)>1:
#print "Multiple teachers: (" + tch + ")"
schedule.loc[index,'teacher'] = parts[0]
#print " Fixed original: ", schedule.loc[index]
for t in parts[1:]:
r['teacher'] = t
schedule.loc[-1] = r
#print " New row appended: ", schedule.loc[-1]
schedule = schedule.assign(dept = schedule['code'].apply(split_class_dept))
schedule = schedule.assign(codenum = schedule['code'].apply(split_class_code))
schedule = schedule.assign(codeletter = schedule['code'].apply(split_class_code_letter))
#print(schedule)
schedule['sem'] = short
#print schedule.columns
return schedule
def get_enrlmts_for_user(user,enrollments):
#active enrollments
u_en = enrollments[ lambda x: (x['user_id'] == user) & (x['workflow']=='active') ]
return u_en[['type','course_id']]
################
################ CANVAS DATA
################
################
################
# Get something from Canvas Data
def do_request(path):
#Set up the request pieces
method = 'GET'
host = 'api.inshosteddata.com'
apiTime = dt.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
apiContentType = 'application/json'
msgList = []
msgList.append(method)
msgList.append(host)
msgList.append(apiContentType)
msgList.append('')
msgList.append(path)
msgList.append('')
msgList.append(apiTime)
msgList.append(apiSecret)
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
sig = sig.decode('utf-8')
headers = {}
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
headers['Date'] = apiTime
headers['Content-type'] = apiContentType
#Submit the request/get a response
uri = "https://"+host+path
print (uri)
print (headers)
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
#Check to make sure the request was ok
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
jsonData = response.json()
#print(json.dumps(jsonData, indent=4))
return jsonData
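# Example (sketch -- this is how sync_non_interactive() and interactive() call it):
#   resp = do_request('/api/account/self/file/sync')
#   # resp['files'] is the list of dump files, each with a 'filename' and a signed 'url'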
# Canvas data, download all new files
def sync_non_interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
gotten = os.listdir(local_data_folder)
wanted = []
i = 0
for x in resp['files']:
filename = x['filename']
exi = "No "
if filename in gotten: exi = "Yes"
else: wanted.append(x)
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
i += 1
print("I will attempt to download %i files." % len(wanted))
#answer = input("Press enter to begin, or q to quit ")
#if not answer == '': return
good_count = 0
bad_count = 0
for W in wanted:
print("Downloading: " + W['filename'])
response = requests.request(method='GET', url=W['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
(response.status_code, response.reason))
print('URL: ' + W['url'])
bad_count += 1
else:
#Use the downloaded data
with open(local_data_folder + W['filename'], 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
good_count += 1
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
# list files in canvas_data (online) and choose one or some to download.
def interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
i = 0
gotten = os.listdir(local_data_folder)
for x in resp['files']:
print(str(i) + '.\t' + x['filename'])
i += 1
which = input("Which files to get? (separate with commas, or say 'all') ")
if which=='all':
which_a = list(range(i))  # all indices 0..i-1
else:
which_a = which.split(",")
for W in which_a:
this_i = int(W)
this_f = resp['files'][this_i]
filename = this_f['filename']
if filename in gotten: continue
print("Downloading: " + filename)
response = requests.request(method='GET', url=this_f['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
with open(local_data_folder + filename, 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
"""if filename.split('.')[-1] == 'gz':
try:
plain_filename = 'canvas_data/' + ".".join(filename.split('.')[:-1])
pf = open(plain_filename,'w')
with gzip.open('canvas_data/' + filename , 'rb') as f:
pf.write(f.read())
except Exception as e:
print "Failed to ungizp. Probably too big: " + str(e)"""
###### SSB SCHEDULE
######
######
######
def todays_date_filename(): # helper
n = datetime.datetime.now()  # `datetime` is the module here, so qualify the class
return "reg_" + short_sem + "_" + n.strftime('%Y%m%d')
def nowAsStr(): # possible duplicate
#Get the current time, printed in the right format
currentTime = datetime.datetime.utcnow()
prettyTime = currentTime.strftime('%a, %d %b %Y %H:%M:%S GMT')
return prettyTime
def row_has_data(r): # helper
if r.find_all('th'):
return False
if len(r.find_all('td')) > 2:
return True
if re.search(r'Note:', r.get_text()):
return True
return False
def row_text(r): # helper
#global dbg
d("Row Txt Fxn gets: ")
arr = []
for t in r.find_all('td'):
if t.contents and len(t.contents) and t.contents[0].name == 'img':
arr.append("1")
d("img")
r_text = t.get_text()
arr.append(r_text)
if 'colspan' in t.attrs and t['colspan']=='2':
d('[colspan2]')
arr.append('')
d("\t"+r_text, end=" ")
d('')
if len(arr)==1 and re.search(r'Note:',arr[0]):
note_line = clean_funny( arr[0] )
note_line = re.sub(r'\n',' ', note_line)
note_line = re.sub(r'"','', note_line)
#note_line = re.sub(r',','\,', note_line)
return ',,,,,,,,,,,,,,,,,,"' + note_line + '"\n'
del arr[0]
arr[1] = clean_funny(arr[1])
arr[2] = clean_funny(arr[2])
if arr[1]: arr[1] = arr[1] + " " + arr[2]
del arr[2]
arr = [ re.sub(r'&nbsp;','',a) for a in arr]
arr = [ re.sub(',','. ',a) for a in arr]
arr = [ re.sub(r'\(P\)','',a) for a in arr]
arr = [ a.strip() for a in arr]
#del arr[-1]
r = ','.join(arr)+'\n'
r = re.sub('\n','',r)
r = re.sub('add to worksheet','',r)
d("Row Txt Fxn returns: " + r + "\n\n")
return r + '\n'
# Take banner's html and make a csv(?) file
def ssb_to_csv(src):
#out = codecs.open(schedfile,'w','utf-8')
output = 'crn,code,sec,cmp,cred,name,days,time,cap,act,rem,wl_cap,wl_act,wl_rem,teacher,date,loc,ztc,note\n'
b = bs(src, 'html.parser')
tab = b.find(class_="datadisplaytable")
if not tab:
print("hmm... didn't find a 'datadisplaytable' in this html: ")
#print(src)
return 0
rows = tab.find_all('tr')
drows = list(filter(row_has_data,rows))
for dd in drows:
t = row_text(dd)
output += t
return output
def clean_funny(str):
if str and str == '\xa0': return ''  # cell that is only a non-breaking space
return str
def clean_funny2(str):
if str and str == '\xa0': return ''
if str and str == ' ': return ''
return str
def clean_funny3(str):
return re.sub('\xa0','',str)
### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
def course_start(course):
#todo: use this to make a early/late/short field and store semester dates w/ other constants
start = dt(2019,1,28)
end = dt(2019,5,24)
# is it normal, early, late, winter?
li = course[0]
date = li[12]
if date=='01/28-05/24':
return 'Normal'
if date=='TBA':
return 'TBA'
if date=='01/02-01/25':
return 'Winter'
if date=='01/02-01/24':
return 'Winter'
ma = re.search( r'(\d+)\/(\d+)\-(\d+)\/(\d+)', date)
if ma:
# TODO do these years matter?
mystart = dt(2019, int(ma.group(1)), int(ma.group(2)))
if int(ma.group(1)) > 10: mystart = dt(2018, int(ma.group(1)), int(ma.group(2)))
myend = dt(2019, int(ma.group(3)), int(ma.group(4)))
length = myend - mystart
weeks = length.days / 7
if mystart != start:
if mystart < start:
#print 'Early Start ', str(weeks), " weeks ",
return 'Early start'
else:
#print 'Late Start ', str(weeks), " weeks ",
return 'Late start'
else:
if myend > end:
#print 'Long class ', str(weeks), " weeks ",
return 'Long term'
else:
#print 'Short term ', str(weeks), " weeks ",
return 'Short term'
#return ma.group(1) + '/' + ma.group(2) + " end: " + ma.group(3) + "/" + ma.group(4)
else:
return "Didn't match: " + date
def time_to_partofday(t):
#todo: account for multiple sites/rows
# 11:20 am-12:10 pm
mor = strptime('12:00 PM', '%I:%M %p')
mid = strptime( '2:00 PM', '%I:%M %p')
aft = strptime( '6:00 PM', '%I:%M %p')
if t == 'TBA':
return 'TBA'
t = t.upper()
parts = t.split('-')
try:
begin = strptime(parts[0], '%I:%M %p')
end = strptime(parts[1], '%I:%M %p')
if end > aft:
return "Evening"
if end > mid:
return "Afternoon"
if end > mor:
return "Midday"
return "Morning"
#return begin,end
except Exception as e:
#print 'problem parsing: ', t, " ",
return ""
# Deduce a 'site' field, based on room name and known offsite locations
def room_to_site(room,verbose=0):
#todo: account for multiple sites/rows
#todo: better way to store these offsite labels
othersites = 'AV,SBHS I-243,SBHS I-244,LOADCS,HOPEH,HOPEG,PLY,SAS,SBHS,LOHS,CHS,SBRAT,'.split(',')
# is it gilroy, mh, hol, other, online or hybrid?
site = 'Gilroy'
#if len(course[0]) > 13:
# room = course[0][13]
if room in othersites:
site = "Other"
if room == 'TBA':
site = 'TBA'
if room == 'AV':
site = 'San Martin Airport'
if re.search('MHG',room):
site = 'Morgan Hill'
if re.search('HOL',room):
site = 'Hollister'
if re.search('COY',room):
site = 'Coyote Valley'
if re.search('OFFSTE',room):
site = 'Other'
if re.search('ONLINE',room):
site = 'Online'
if verbose: print(room, '\t', end=' ')
return site
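# Examples:
#   room_to_site('MHG120')  -> 'Morgan Hill'
#   room_to_site('ONLINE')  -> 'Online'
#   room_to_site('AV')      -> 'San Martin Airport'
#   anything unrecognized   -> 'Gilroy'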
from io import StringIO
# take text lines and condense them to one dict per section
def to_section_list(input_text,verbose=0):
this_course = ''
#todo: no output files
#jout = codecs.open(filename, 'w', 'utf-8')
#input = csv.DictReader(open(schedfile,'r'))
#input = UnicodeDictReader(input_text.splitlines())
all_courses = []
try:
f = StringIO(input_text)
except:
print("ERROR with this input_text:")
print(input_text)
return all_courses  # can't parse anything without a readable buffer
reader = csv.reader(f, delimiter=',')
headers = next(reader)
for r in reader:
d = dict(list(zip(headers,r)))
#pdb.set_trace()
# clean funny unicode char in blank entries
r = {k: clean_funny2(v) for k,v in list(d.items()) }
if verbose: print("Cleaned: " + str(r))
if 'time' in r:
if r['time']=='TBA': r['time'] = ''
if r['time']: r['partofday'] = time_to_partofday(r['time'])
r['type'] = ''
if 'loc' in r:
if r['loc'] == 'ONLINE': r['type'] = 'online'
if r['loc'] == 'ONLINE' and r['time']: r['type'] = 'online live'
if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
if r['loc']: r['site'] = room_to_site(r['loc'],verbose)
if 'code' in r:
if re.search(r'ONLINE\sLIVE',r['code']):
r['type'] = 'online live'
elif re.search(r'ONLINE',r['code']):
r['type'] = 'online'
# does it have a crn? then it starts a new section; otherwise it continues the previous one
if r['crn']: # is a new course or a continuation?
if verbose: print(" it's a new section.")
if this_course:
if not this_course['extra']: this_course.pop('extra',None)
all_courses.append(this_course)
this_course = r
#print(r['name'])
this_course['extra'] = []
else:
# is a continuation line
if verbose: print(" additional meeting: " + str(r))
for k,v in list(r.items()):
if not v: r.pop(k,None)
# TODO: if extra line is different type?
#if this_course['type']=='online' and r['type'] != 'online': this_course['type'] = 'hybrid'
#elif this_course['type']!='online' and r['type'] == 'online': this_course['type'] = 'hybrid'
this_course['extra'].append(r)
if this_course:  # append the final section after the loop (otherwise the last course would be dropped)
if not this_course['extra']: this_course.pop('extra',None)
all_courses.append(this_course)
return all_courses
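# Typical chain (see scrape_schedule() below): raw SSB html -> csv text -> one dict per section:
#   sections = to_section_list(ssb_to_csv(html_text))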
# Schedule / course filling history
# csv headers: crn, code, teacher, datetime, cap, act, wlcap, wlact
# Log the history of enrollments per course during registration
def log_section_filling(current_sched_list):
rows = 'timestamp crn code teacher cap act wl_cap wl_act'.split(' ')
rows_j = 'crn code teacher cap act wl_cap wl_act'.split(' ')
print(rows_j)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
csv_fn = 'cache/reg_history_' + short_sem + '.csv'
with codecs.open(csv_fn,'a','utf-8') as f:
writer = csv.writer(f)
for S in current_sched_list:
#print(S)
items = [now,]
items.extend( S[X] for X in rows_j )
writer.writerow(items)
# Same as above, but compressed, act only
def log_section_filling2(current_sched_list):
now = datetime.datetime.now().strftime('%Y-%m-%dT%H')
todays_data = { int(S['crn']): S['act'] for S in current_sched_list }
#print(todays_data)
todays_df = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
todays_df = todays_df.rename_axis('crn')
#print(todays_df)
todays_df.to_csv('cache/reg_today_new.csv', index=True)
try:
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
print(myframe)
except:
fff = open('cache/reg_data_'+short_sem+'.csv','w')
fff.write('crn\n')
fff.close()
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
#myframe = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
#myframe = myframe.rename_axis('crn')
print("Creating new data file for this semester.")
new_df = myframe.join( todays_df, on='crn', how='outer' )
new_df = new_df.rename_axis('crn')
print(new_df)
reg_data_filename = 'reg_data_' + short_sem + '.csv'
new_df.to_csv('cache/' + reg_data_filename, index=False)
put_file('/home/public/schedule/', 'cache/', reg_data_filename, 0)
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
def scrape_schedule():
#url = "https://ssb.gavilan.edu/prod/twbkwbis.P_GenMenu?name=bmenu.P_StuMainMnu"
url = "https://ssb-prod.ec.gavilan.edu/PROD/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"
text = ''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
try:
driver = webdriver.Firefox()
driver.get(url)
driver.find_element_by_id("UserID").clear()
driver.find_element_by_id("UserID").send_keys(GOO)
driver.find_element_by_name("PIN").send_keys(GOO_PIN)
driver.find_element_by_name("loginform").submit()
driver.implicitly_wait(5)
print(driver.title)
driver.find_element_by_link_text("Students").click()
driver.implicitly_wait(5)
print(driver.title)
driver.find_element_by_link_text("Registration").click()
driver.implicitly_wait(5)
print(driver.title)
driver.find_element_by_link_text("Search for Classes").click()
driver.implicitly_wait(15)
print(driver.title)
dd = Select(driver.find_element_by_name("p_term"))
if (dd):
dd.select_by_visible_text(SEMESTER)
driver.find_element_by_xpath("/html/body/div/div[4]/form").submit()
driver.implicitly_wait(15)
print(driver.title)
driver.find_element_by_xpath("/html/body/div/div[4]/form/input[18]").click()
driver.implicitly_wait(10)
print(driver.title)
driver.find_element_by_name("SUB_BTN").click()
driver.implicitly_wait(40)
time.sleep(15)
driver.implicitly_wait(40)
print(driver.title)
text = driver.page_source
driver.quit()
except Exception as e:
print("Got an exception: ", e)
finally:
print("")
#driver.quit()
codecs.open('cache/' + filename_html,'w', 'utf-8').write(text)
#print(text)
as_list = ssb_to_csv(text)
#print(as_list)
as_dict = to_section_list(as_list)
jj = json.dumps(as_dict,indent=2)
# TODO
try:
ps = codecs.open('cache/'+filename,'r','utf-8')
prev_sched = json.loads(ps.read())
ps.close()
if 1: # sometimes I want to re-run this without affecting the logs.
log_section_filling(as_dict)
log_section_filling2(as_dict)
dd = DeepDiff(prev_sched, as_dict, ignore_order=True)
pretty_json = json.dumps( json.loads( dd.to_json() ), indent=2 )
codecs.open('cache/%s_sched_diff.json' % short_sem,'w','utf-8').write( pretty_json ) # dd.to_json() )
except Exception as e:
print(e)
print("Can't do diff?")
# Next, rename the prev sched_xxYY.json data file to have its date,
# make this new one, and then upload it to the website.
# Maybe even count the entries and do a little sanity checking
#
# print("Last modified: %s" % time.ctime(os.path.getmtime("test.txt")))
# print("Created: %s" % time.ctime(os.path.getctime("test.txt")))
try:
last_mod = time.ctime(os.path.getmtime('cache/' + filename))
import pathlib
prev_stat = pathlib.Path('cache/' + filename).stat()
mtime = dt.fromtimestamp(prev_stat.st_mtime)
print(mtime)
except:
print("Couldn't stat the previous schedule file.")
# fname = pathlib.Path('test.py')
# assert fname.exists(), f'No such file: {fname}' # check that the file exists
# print(fname.stat())
#
# os.stat_result(st_mode=33206, st_ino=5066549581564298, st_dev=573948050, st_nlink=1, st_uid=0, st_gid=0, st_size=413,
# st_atime=1523480272, st_mtime=1539787740, st_ctime=1523480272)
codecs.open('cache/' + filename, 'w', 'utf-8').write(jj)
put_file('/home/public/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
return as_dict
def dza_sched():
text = codecs.open('cache/sched_fa22_deanza.html','r','utf-8').read()
as_list = ssb_to_csv(text)
#print(as_list)
as_dict = to_section_list(as_list)
jj = json.dumps(as_dict,indent=2)
codecs.open('cache/fa22_sched_deanza.json','w','utf-8').write(jj)
# recreate schedule json files with most current online schedule format.
def recent_schedules():
# # todo: sems is a global in this file. Is that the right thing to do?
#all_scheds = [ os.listdir( 'cache/rosters/' + shortToLongSem(s)) for s in sems ]
#for i,s in enumerate(sems):
for s in ['sp21',]:
filename = 'cache/sched_' + s + '.html'
print("Filename is %s" % filename)
input = codecs.open( filename, 'r', 'utf-8').read()
output = ssb_to_csv(input)
csv_fn = 'cache/temp_sched_' + s + '.csv'
if os.path.isfile(csv_fn):
os.remove(csv_fn)
codecs.open(csv_fn,'w','utf-8').write(output)
jsn = to_section_list(output)
jsn_fn = 'cache/semesters/'+shortToLongSem(s)+'/'+s+'_sched.json'
if os.path.isfile(jsn_fn):
os.remove(jsn_fn)
codecs.open(jsn_fn,'w').write(json.dumps(jsn))
print("I put the most recent schedule JSON files in ./cache/semesters/... folders.")
################
################ ROSTERS AND REGISTRATION
################
################
################
# todo: the pipeline is disorganized. Organize it to have
# a hope of taking all this to a higher level.
#
# todo: where does this belong in the pipeline? compare with recent_schedules()
# Take the generically named rosters uploads files and move them to a semester folder and give them a date.
def move_to_folder(sem,year,folder):
semester = year+sem
semester_path = 'cache/rosters/%s' % semester
if not os.path.isdir('cache/rosters/'+semester):
os.makedirs('cache/rosters/'+semester)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
print("+ Moving roster files to folder: %s" % semester_path)
if not os.path.isdir(semester_path):
print("+ Creating folder: %s" % semester_path)
os.makedirs(semester_path)
os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
# Take raw upload (csv) files and make one big json out of them.
# This relates to enrollment files, not schedule.
def convert_roster_files(semester="",year="",folder=""):
if not semester:
semester = input("the semester? (ex: spring) ")
year = input("the year? (ex: 2020) ")
folder = input("Folder? (ex 2020-02-25-14-58-20) ")
uf = open('cache/rosters/users-'+folder+'.csv','r')
cf = open('cache/rosters/courses-'+folder+'.csv','r')
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
u = csv.DictReader(uf)
c = csv.DictReader(cf)
e = csv.DictReader(ef)
uu = [i for i in u]
cc = [i for i in c]
ee = [i for i in e]
uf.close()
cf.close()
ef.close()
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
if os.path.exists(myrosterfile):
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
last_fileobj = open(myrosterfile,'r')
last_file = json.load(last_fileobj)
last_fileobj.close()
info = last_file[3]
last_date = info['date_filestring']
print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
try:
os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
print(' -- ok')
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(e)
myrosterfile = "new_" + myrosterfile
pass
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
try:
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
new_roster.close()
print(" -- Wrote roster info to: %s." % myrosterfile)
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(" ** " + str(e))
# From instructure sftp site
def fetch_current_rosters():
dt_label = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
sftp.chdir('SIS')
files = sftp.listdir()
print("--> %s I see these files at instructure ftp site: " % dt_label )
[print(" %s" % f) for f in files]
i = 0
got_courses = 0
if len(files)>1: # and 'users.csv' in files:
try:
if 'users.csv' in files:
sftp.get('users.csv','cache/rosters/users-'+dt_label+'.csv')
i += 1
except:
print(' * users.csv not present')
try:
if 'courses.csv' in files:
sftp.get('courses.csv','cache/rosters/courses-'+dt_label+'.csv')
i += 1
got_courses = 1
except:
print(' * courses.csv not present')
try:
if 'enrollments.csv' in files:
sftp.get('enrollments.csv','cache/rosters/enrollments-'+dt_label+'.csv')
i += 1
except:
print(' * enrollments.csv not present')
print(' Saved %i data files in rosters folder.' % i)
if got_courses:
courses = open('cache/rosters/courses-%s.csv' % dt_label,'r')
courses.readline()
a = courses.readline()
print(a)
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
print(" -> This semester is: %s, %s" % (this_sem,year))
print(' -> %s building data file...' % dt_label)
convert_roster_files(this_sem,year,dt_label)
print(' -> moving files...')
move_to_folder(this_sem,year,dt_label)
else:
print(" * No courses file. Not moving files.")
else:
print("--> Don't see files.")
sftp.close()
def fetch_current_rosters_auto():
schedule.every().hour.at(":57").do(fetch_current_rosters)
schedule.every().day.at("12:35").do(sync_non_interactive)
schedule.every().day.at("21:00").do(sync_non_interactive)
print("running every hour on the :57\n")
while True:
try:
schedule.run_pending()
except Exception as e:
import traceback
print(" ---- * * * Failed with: %s" % str(e))
ff = open('cache/pipeline.log.txt','a')
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
ff.write(traceback.format_exc()+"\n---------\n\n")
ff.close()
#schedule.CancelJob
time.sleep(15)
# read schedule file with an eye toward watching what's filling up
def schedule_filling():
sem = 'spring2021' # todo: hardcoded
days = []
for f in sorted(os.listdir('cache/rosters/'+sem+'/')):
if f.endswith('.html'):
match = re.search(r'sched_(\d\d\d\d)_(\d\d)_(\d+)\.html',f)
if match:
print(f)
y = match.group(1)
m = match.group(2)
d = match.group(3)
print("Schedule from %s %s %s." % (y,m,d))
csv_sched = ssb_to_csv(open('cache/rosters/'+sem+'/'+f,'r').read())
jsn = to_section_list(csv_sched)
#print(json.dumps(jsn,indent=2))
days.append(jsn)
day1 = days[-2]
day2 = days[-1]
df = jsondiff.diff(day1, day2)
gains = defaultdict( list )
for D in df.keys():
if isinstance(D, int):
#print(day1[D]['code'] + '\t' + day1[D]['crn'] + ' Before: ' + day1[D]['act'] + ' After: ' + day2[D]['act'])
try:
gain = int(day2[D]['act']) - int(day1[D]['act'])
gains[gain].append( day1[D]['code'] + ' ' + day1[D]['crn'] )
except:
print("No gain for " + str(D))
#print("\t" + str(df[D]))
else:
print(D)
print(df[D])
for key, value in sorted(gains.items(), key=lambda x: x[0]):
print("{} : {}".format(key, value))
#print(json.dumps(gains,indent=2))
################
################ SENDING DATA AWAY
################
################
################
# Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1):
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
#todo: these paths
#files = sftp.listdir()
#print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath)
files = sftp.listdir()
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
if prompt:
input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
sftp.close()
"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
print("This local file: " + f + " ", end=' ')
if not f in files:
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
print("Uploaded.")
else:
print("Skipped.")
"""
"""if len(files)==3 and 'users.csv' in files:
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
sftp.get('users.csv','rosters/users-'+folder+'.csv')
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
print folder + '\tSaved three data files in rosters folder.'
courses = open('rosters/courses-'+folder+'.csv','r')
courses.readline()
a = courses.readline()
print a
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
#print this_sem, "", year
print folder + '\tbuilding data file...'
convert_roster_files(this_sem,year,folder)
print folder + '\tmoving files...'
move_to_folder(this_sem,year,folder)
else:
print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################
def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"
def question(t,bracket=1):
ret = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
ret += "<a name='" + match.group(1) + "'></a>"
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
id = ''
for p in parts:
if re.search(r'[a-zA-Z]',p[0]): id += p[0]
ret += "<a name='%s'></a>" % id.lower()
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
def answer(t):
return t + '</div></div>\n'
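# Sketch of the HTML these helpers emit (the question text here is just an example):
#   question('How do I drop a class? [drop]')
#     -> "<a name='drop'></a>" followed by an accordion header for the question
#   answer('<p>...</p>')
#     -> the answer body plus the closing </div></div> for the accordion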
def read_paragraph_element(element,type="NORMAL_TEXT"):
"""Returns the text in the given ParagraphElement.
Args:
element: a ParagraphElement from a Google Doc.
"""
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run:
return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
def get_doc(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
#ooout = open(fileout,'w')
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
if verbose: print(document)
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
if verbose: print(doc_content)
doc_objects = document.get('inlineObjects')
if verbose: print(doc_objects)
doc_lists = document.get('lists')
text = '<div class="acrd_grp" data-accordion-group="">'
last_type = ''
answer_text = ''
in_a_list = ''
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
if doc_objects:
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
#input('x?')
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
tempout.write('- - - - - - - -\n\n')
#for value in doc_lists:
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
tempout.write('- - - - - - - -\n\n')
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
# todo: x link, x bold, list, image.
tag_fxn = para
if 'paragraph' in value:
this_text = ''
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if lid == list_stack[0]: # 2
pass
else:
if not lid in list_stack: # 3
list_stack.append(lid)
else: # 4
x = list_stack.pop()
while x != lid: list_stack.pop()
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer_text += "<ul>" * deeper
elif deeper < 0:
deeper = -1 * deeper
answer_text += "</ul>" * deeper
if len(list_stack):
tag_fxn = li
elements = value.get('paragraph').get('elements')
# inlineObjectElement": {
# "inlineObjectId": "kix.ssseeu8j9cfx",
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
#text += json.dumps(style, sort_keys=True, indent=4)
if 'namedStyleType' in style:
type = style['namedStyleType']
for elem in elements:
# text content
this_text += read_paragraph_element(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
if last_type=='NORMAL_TEXT' and type!=last_type:
text += answer(answer_text)
answer_text = ''
if type=='HEADING_2':
text += sec(this_text)
this_text = ''
elif type=='HEADING_3':
text += question(this_text,bracket)
this_text = ''
else:
answer_text += tag_fxn(this_text)
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
# The text in table cells is in nested Structural Elements, and tables may be
# nested.
text += "\nTABLE\n"
#table = value.get('table')
#for row in table.get('tableRows'):
# cells = row.get('tableCells')
# for cell in cells:
# text += read_strucutural_elements(cell.get('content'))
#elif 'tableOfContents' in value:
# # The text in the TOC is also in a Structural Element.
# toc = value.get('tableOfContents')
# text += read_strucutural_elements(toc.get('content'))
#else:
# print(json.dumps(value, sort_keys=True, indent=4))
text += answer(answer_text)
#text += '</div>'
#print(text)
return text
######### TRY #2 ######
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run: return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
begin = '<em>' + begin
end = end + '</em>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
text = t[7:].strip()
parts = text.split(", ")
return ('icons',parts)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
text = t[6:].strip()
parts = text.split(", ")
return ('tags',parts)
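# Examples (sketch -- the icon/tag names are just example values from the Google Doc text):
#   handle_icons('Icons: canvas, zoom') -> ('icons', ['canvas', 'zoom'])
#   handle_tags('Tags: registration')   -> ('tags', ['registration'])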
def handle_question(t,bracket=1):
anchor = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
anchor = match.group(1).lower()
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
for p in parts:
if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
return ('question', t, anchor)
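# Example: handle_question('How do I log in? [login]') -> ('question', 'How do I log in? ', 'login')
# Without a [bracketed] id, the anchor falls back to the question's initials.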
def handle_answer(t):
return ('answer',t)
def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
global img_count, img_lookup, img_heights, img_widths
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) \
+ "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
doc_objects = document.get('inlineObjects')
doc_lists = document.get('lists')
#text = ''
result = []
last_type = ''
#answer_text = ''
answer = []
in_a_list = ''
# Get all the images
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
fetched = fetch_doc_image(k,value)
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
tag_fxn = handle_para
if 'paragraph' in value:
this_text = ''
# First we deal with if we're in a list.
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one (do nothing),
# (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if not lid == list_stack[0]:
if not lid in list_stack: # 3
list_stack.append(lid)
else: # 4
x = list_stack.pop()
while x != lid: list_stack.pop()
elif len(list_stack) > 0:
# current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer.append("<ul>" * deeper)
elif deeper < 0:
deeper = -1 * deeper
answer.append("</ul>" * deeper)
if len(list_stack):
tag_fxn = handle_li
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
elements = value.get('paragraph').get('elements')
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
if 'namedStyleType' in style:
type = style['namedStyleType']
# and FINALLY, the actual contents.
for elem in elements:
# text content
this_text += read_paragraph_element_2(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
# Now for something tricky. Call an appropriate handler, based on:
# (a) what is the paragraph style type?
# (b) is it different from the prev one?
if last_type=='NORMAL_TEXT' and type!=last_type:
if this_text.strip():
result.append(handle_answer(answer))
answer = []
#answer_text = ''
if type=='HEADING_2' and this_text.strip():
result.append( handle_sec(this_text) )
this_text = ''
elif type=='HEADING_3' and this_text.strip():
result.append(handle_question(this_text,bracket))
this_text = ''
else:
if this_text.lower().startswith('tags:'):
tag_fxn = handle_tags
if this_text.lower().startswith('icons:'):
tag_fxn = handle_icons
if this_text.strip():
answer.append(tag_fxn(this_text))
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
pass
result.append(handle_answer(answer))
return json.dumps(result,indent=4)
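# Hypothetical usage (DOC_ID stands in for a real Google Doc id, which lives elsewhere in the app):
#   blocks = json.loads(get_doc_generic(DOC_ID))
#   # blocks is a list of ('section'|'question'|'answer'|'tags'|...) tuples, serialized as JSON arrays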
def scrape_schedule_py():
return 1
"""
cur_session = requests.Session()
mygav_url = "https://lum-prod.ec.gavilan.edu/"
r1 = requests.get(mygav_url)
login_url1 = "https://lum-prod.ec.gavilan.edu/c/portal/login"
login_url = "https://eis-prod.ec.gavilan.edu/authenticationendpoint/login.do?commonAuthCallerPath=%2Fsamlsso&forceAuth=false&passiveAuth=false&tenantDomain=carbon.super&sessionDataKey=57203341-6823-4511-b88e-4e104aa2fd71&relyingParty=LP5PROD_LuminisPortalEntity&type=samlsso&sp=Luminis+Portal+PROD&isSaaSApp=false&authenticators=BasicAuthenticator:LOCAL"
"""
def scrape_schedule_multi():
global SEMESTER, short_sem, semester_begin, filename, filename_html
SEMESTER = 'Spring 2023'
short_sem = 'sp23'
semester_begin = strptime('01/30', '%m/%d')
filename = 'sp23_sched.json'
filename_html = 'sp23_sched.html'
SEM = ['Fall 2022', 'Summer 2022 (View only)', 'Spring 2022 (View only)',
'Fall 2021 (View only)', 'Summer 2021 (View only)', 'Spring 2021 (View only)', 'Fall 2020 (View only)', 'Summer 2020 (View only)', 'Spring 2020 (View only)',
'Fall 2019 (View only)', 'Summer 2019 (View only)', 'Spring 2019 (View only)', 'Fall 2018 (View only)', 'Summer 2018 (View only)', 'Spring 2018 (View only)' ]
srt = 'fa22,su22,sp22,fa21,su21,sp21,fa20,su20,sp20,fa19,su19,sp19,fa18,su18,sp18'.split(',')
beg = ['08/22','06/13','01/31','08/23','06/14','02/01','08/24','06/15','01/27','08/26','06/17','01/28','08/27','06/18','01/29']
#for i in [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]:
#SEMESTER = SEM[i]
#short_sem = srt[i]
#semester_begin = strptime(beg[i], '%m/%d')
#filename = '%s_sched.json' % short_sem
#filename_html = '%s_sched.html' % short_sem
as_dict = scrape_schedule()
expanded = list_latestarts(short_sem)
fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")
ffcsv = codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8')
with ffcsv as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(fields)
for S in expanded:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
# S['act'] = S['cap']
if S['loc'] == "ONLINE LIVE": S['site'] = 'OnlineLive'
csvwriter.writerow( [ S[x] for x in fields ] )
put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)
def scrape_for_db():
global SEMESTER, gp, dean, short_sem, semester_begin, filename, filename_html
fields = 'sem,crn,dept,num,gp,dean,code,name,teacher,type,cap,act,loc,site,date,days,time,cred,ztc'.split(',')
"""
SEMESTER = 'Fall 2022'
short_sem = 'fa22'
semester_begin = strptime('08/22', '%m/%d')
filename = 'fa22_sched.json'
filename_html = 'fa22_sched.html'
as_dict = scrape_schedule()
fff = codecs.open('cache/%s_sched.sql' % filename, 'w', 'utf-8')
fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
for S in as_dict:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
str = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
print(str)
fff.write(str)
fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
fff.close()
"""
SEMESTER = 'Spring 2023 (View only)'
short_sem = 'sp23'
semester_begin = strptime('01/30', '%m/%d')
filename = 'sp23_sched.json'
filename_html = 'sp23_sched.html'
as_dict = scrape_schedule()
fff = codecs.open('cache/%s_sched.sql' % filename, 'w', 'utf-8')
fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
for S in as_dict:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
str = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
print(str)
fff.write(str)
fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
fff.close()
def argos_data():
global dean,gp
f2 = codecs.open('cache/enrollment_sp23.csv','w','utf-8')
writer = csv.writer(f2)
headers = 'gp dean dept num code crn name act site'.split(' ')
writer.writerow(headers)
f = codecs.open('cache/sched_draft_sp23.csv','r','utf-8')
reader = csv.reader(f, delimiter=',')
headers = next(reader)
for r in reader:
d = dict(list(zip(headers,r)))
print(d)
my_dean = dean[d['Subj']]
my_gp = gp[d['Subj']]
dept = d['Subj']
num = d['Crse No']
code = dept + " " + num
crn = d['CRN']
name = d['Course Title']
act = d['Open Seats']
campus = d['Campus']
session = d['Session']
if campus == "Off Campus": site = session
else: site = campus
print(site)
writer.writerow([my_gp,my_dean,dept,num,code,crn,name,act,site])
def expand_old_semesters():
#terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20,fa20,sp21,su21,fa21,sp22,su22,fa22'.split(',')
terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20'.split(',')  # the shorter list is the one currently in effect
terms.reverse()
for t in terms:
list_latestarts(t)
input('press return to continue.')
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
show_summary = 1
the_year = '20' + term[2:4]
print("year: ", the_year, " semester: ", term)
term_in = "cache/%s_sched.json" % term
term_out = "cache/%s_latestarts.txt" % term
expanded_out = "%s_sched_expanded.json" % term
print("Writing output to " + term_out)
infile = codecs.open(term_in, "r", "utf-8")
outfile = codecs.open(term_out, "w", "utf-8")
exoutfile = codecs.open('cache/' + expanded_out, "w", "utf-8")
expanded = []
sched = json.loads(infile.read())
#print sched
by_date = {}
if show_summary: print("course \t loc \t type \t time")
for C in sched:
if (not C['type']) and C['loc'] != 'ONLINE': # and C['time']:
C['type'] = 'in-person'
if show_summary: print("%s \t %s \t %s \t %s" % (C['code'],C['loc'],C['type'],C['time']))
if 'extra' in C:
if 'partofday' in C and ('type' in C['extra'][0]) and (C['extra'][0]['type'] == 'online') and C['loc'] != "ONLINE LIVE":
C['type'] = 'hybrid'
times = C['time'].split("-")
if len(times) > 1:
time_start = times[0]
time_end = times[1]
try:
startt = time.strptime(time_start,"%I:%M %p")
endt = time.strptime(time_end,"%I:%M %p")
min_start = startt.tm_min
min_end = endt.tm_min
if min_start == 0: min_start = "00"
else: min_start = str(min_start)
if min_end == 0: min_end = "00"
else: min_end = str(min_end)
C['time_start'] = "%i:%s" % (startt.tm_hour, min_start )
C['time_end'] = "%i:%s" % (endt.tm_hour, min_end )
if 0:
print("+ Parsed %s into %s and %s." % (C['time'], C['time_start'], C['time_end']))
except Exception as e:
print(e, "\n-- problem parsing time ", time_start, " or ", time_end)
else:
C['time_start'] = ''
C['time_end'] = ''
if re.search('TBA',C['date']):
C['start'] = ''
C['end'] = ''
C['doy'] = ''
expanded.append(C)
continue
parts = C['date'].split("-")
start = parts[0] + "/" + the_year
end = parts[1] + "/" + the_year
try:
startd = parser.parse(start)
endd = parser.parse(end)
C['start'] = "%i-%i" % (startd.month,startd.day)
C['end'] = "%i-%i" % (endd.month,endd.day)
C['doy'] = startd.timetuple().tm_yday
expanded.append(C)
except Exception as e:
print(e, "\n-- problem parsing ", start, " or ", end)
if not startd in by_date:
by_date[startd] = []
by_date[startd].append(C)
exoutfile.write( json.dumps(expanded,indent=2) )
exoutfile.close()
put_file('/home/public/schedule/', 'cache/', expanded_out, 0)
for X in sorted(by_date.keys()):
#print("Start: ", X)
if len(by_date[X]) < 200:
prettydate = X.strftime("%A, %B %d")
#print(prettydate + ": " + str(len(by_date[X])) + " courses")
outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
for Y in by_date[X]:
#print "\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher']
#print(Y)
#outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] +"\n")
outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
return expanded
if __name__ == "__main__":
print ('')
options = { 1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
2: ['Fetch rosters',fetch_current_rosters] ,
3: ['Fetch rosters AND canvas data automatically',fetch_current_rosters_auto] ,
4: ['Compute how registration is filling up classes', schedule_filling] ,
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
6: ['Canvas data: interactive sync', interactive ],
7: ['Canvas data: automated sync', sync_non_interactive ],
8: ['Scrape schedule from ssb', scrape_schedule_multi ],
9: ['Test ssb calls with python', scrape_schedule_py ],
10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
}
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
# Testing
#if __name__ == "__main__":
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
#print "These are the users: "
#print users
#getSemesterSchedule()
#get_doc()
#pass