# canvasapp/pipelines.py


import util
import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
from datetime import timedelta
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key
import asyncio
from dap.api import DAPClient
from dap.dap_types import Credentials
from dap.integration.database import DatabaseConnection
from dap.replicator.sql import SQLReplicator
"""
Everything to do with fetching data,
- From iLearn, via token
- current roster uploads from instructures sftp site
- raw logs and other from canvas data repo
- from ssb, use firefox to scrape the schedule
And some subsequent processing:
- Raw roster files, into a more compact json format
- Raw logs into something more useful
"""
verbose = False
users = {}
users_by_id = {}
# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
sys.setrecursionlimit( 100000 )
local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
class FetchError(Exception):
pass
DEBUG = 0
def d(s,end=''):
global DEBUG
if end and DEBUG: print(s,end=end)
elif DEBUG: print(s)
################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################
# Main canvas querying fxn
def fetch(target,verbose=0,params=0,media=0):
    # If there are more results, recursively call myself, adding on to the results.
    results = 0
    count = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
if media:
r2 = requests.get(target, headers = header_media)
elif params:
r2 = requests.get(target, headers = header, params = params)
else:
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
#link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
#if (verbose): print("++ More link: " + link)
nest = fetch(link,verbose,params,media)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
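# Illustrative sketch (not wired into the menu): fetch() follows the "next" links in the
# Link header itself, so one call returns the merged result list. The course id is
# hypothetical; the endpoint shape mirrors the test call at the bottom of this file.
def _example_fetch_course_users():
    roster = fetch('/api/v1/courses/1234/users?per_page=100', verbose=1)
    for user in roster:
        print(user.get('id'), user.get('name'))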
# Main canvas querying fxn - stream version - don't die on big requests
def fetch_stream(target,verbose=0):
    # If there are more results, keep following the "next" link, yielding one page at a time.
    results = 0
    count = 0
while target:
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
if r2.status_code == 502:
raise FetchError()
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
next_link_found = 0
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
target = link
next_link_found = 1
break
if not next_link_found: target = 0
yield results
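# Illustrative sketch: consume fetch_stream() one page at a time so a very large endpoint
# never has to be merged in memory. The endpoint is hypothetical; account_id comes from
# canvas_secrets.
def _example_fetch_stream_pages():
    total = 0
    for page in fetch_stream(f'/api/v1/accounts/{account_id}/courses?per_page=100', verbose=1):
        total += len(page)
        print(f"got {len(page)} items, {total} so far")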
# For endpoints that wrap their results in a one-key dict, collapse that key out,
# because paging makes merging awkward otherwise. Example: enrollment_terms.
def fetch_collapse(target,collapse='',verbose=0):
    # If there are more results, recursively call myself, adding on to the results.
results = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
except:
print("-- Failed to parse: ", r2.text)
if verbose: print(r2.headers)
if collapse and collapse in results:
results = results[collapse]
if ('link' in r2.headers):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
nest = fetch_collapse(link, collapse, verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
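# Illustrative sketch: enrollment terms come back wrapped in a one-key dict
# ({"enrollment_terms": [...]}), which is exactly the case fetch_collapse() flattens.
# account_id comes from canvas_secrets.
def _example_fetch_terms():
    terms = fetch_collapse(f'/api/v1/accounts/{account_id}/terms', collapse='enrollment_terms', verbose=1)
    for term in terms:
        print(term.get('id'), term.get('name'))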
################
################ CANVAS DATA
################
################
################
# Get canvas data 2024 style
def canvas_data_2024_run():
print("Updating all tables.")
asyncio.run(canvas_data_2024())
print("Done with all tables.")
async def canvas_data_2024():
base_url: str = os.environ["DAP_API_URL"]
client_id: str = os.environ["DAP_CLIENT_ID"]
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
# todo: use secrets
connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db"
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
async with DatabaseConnection(connection_string).open() as db_connection:
async with DAPClient(base_url, credentials) as session:
#tables = await session.get_tables("canvas")
for table in desired_tables:
print(f" trying to update {table} ")
try:
#await SQLReplicator(session, db_connection).initialize("canvas", table)
await SQLReplicator(session, db_connection).synchronize("canvas", table)
except Exception as e:
print(f" - skipping {table} because {e}")
# Set up canvas data 2024 style (one-time table initialization)
def setup_canvas_data_2024_run():
print("Setting up all tables.")
asyncio.run(setup_canvas_data_2024())
print("Done with all tables.")
async def setup_canvas_data_2024():
base_url: str = os.environ["DAP_API_URL"]
client_id: str = os.environ["DAP_CLIENT_ID"]
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db"
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
async with DatabaseConnection(connection_string).open() as db_connection:
async with DAPClient(base_url, credentials) as session:
#tables = await session.get_tables("canvas")
for table in desired_tables:
print(f" {table}")
try:
await SQLReplicator(session, db_connection).initialize("canvas", table)
except Exception as e:
print(f" - skipping {table} because {e}")
################
################ ROSTERS AND REGISTRATION
################
################
################
# todo: the pipeline is disorganized. Organize it to have
# a hope of taking all this to a higher level.
#
# todo: where does this belong in the pipeline? compare with recent_schedules()
# Take the generically named roster upload files, move them into a semester folder, and date-stamp each name.
def move_to_folder(sem,year,folder,files):
semester = year+sem
    semester_path = 'cache/rosters/%s' % semester
    now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
    print("+ Moving roster files to folder: %s" % semester_path)
    if not os.path.isdir(semester_path):
        print("+ Creating folder: %s" % semester_path)
        os.makedirs(semester_path)
if 'courses.csv' in files:
os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
if 'enrollments.csv' in files:
os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
if 'users.csv' in files:
os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
# Take the raw upload (csv) files and combine them into one big json file.
# This covers enrollment files, not the schedule.
def convert_roster_files(semester="",year="",folder=""):
    if not semester:
        semester = input("the semester? (ex: spring) ")
        year = input("the year? (ex: 2020) ")
        folder = input("Folder? (ex 2020-02-25-14-58-20) ")
uf = open('cache/rosters/users-'+folder+'.csv','r')
cf = open('cache/rosters/courses-'+folder+'.csv','r')
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
u = csv.DictReader(uf)
c = csv.DictReader(cf)
e = csv.DictReader(ef)
uu = [i for i in u]
cc = [i for i in c]
ee = [i for i in e]
uf.close()
cf.close()
ef.close()
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
if os.path.exists(myrosterfile):
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
last_fileobj = open(myrosterfile,'r')
last_file = json.load(last_fileobj)
last_fileobj.close()
info = last_file[3]
last_date = info['date_filestring']
        print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
        try:
            os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
            print(' -- ok')
        except Exception as e:
            print(" ** Failed because I couldn't move the previous roster file: %s" % myrosterfile)
            print(e)
            # Fall back to a sibling filename so we don't clobber the file we couldn't move.
            myrosterfile = myrosterfile.replace('roster_', 'new_roster_')
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
try:
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
new_roster.close()
print(" -- Wrote roster info to: %s." % myrosterfile)
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(" ** " + str(e))
def file_doesnt_exist(name):
    # Get the list of files already downloaded into the rosters cache
    files = os.listdir('cache/rosters')
    # Filter out zero-size files and directories
    files = [f for f in files if os.path.isfile(os.path.join('cache/rosters', f))
             and os.path.getsize(os.path.join('cache/rosters', f)) > 0]
    if name in files:
        print( f" * file: {name} already exists. not downloading." )
    else:
        print( f" * file: {name} downloading." )
    # True if the file is not in the filtered list (i.e. it still needs downloading)
    return not (name in files)
# From the Instructure SFTP site
def fetch_current_rosters():
import pysftp
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
sftp.chdir('SIS')
files = sftp.listdir()
ff = open('cache/pipeline.log.txt','a')
now = datetime.datetime.now()
exact_time = now.strftime('%Y-%m-%d-%H-%M-%S')
rounded_hour = (now.replace(second=0, microsecond=0, minute=0, hour=now.hour)
+ timedelta(hours=now.minute//30))
rounded_time = rounded_hour.strftime('%Y-%m-%d-%H')
if len(files)>0: # and 'users.csv' in files:
print(f"--> {exact_time}: I see these files at instructure ftp site:")
[print(f" - {f}") for f in files]
i = 0
seen_files = []
check = ['login','users','courses','enrollments']
for checking in check:
try:
if f'{checking}.csv' in files and file_doesnt_exist(f'{checking}-{rounded_time}.csv'):
sftp.get(f'{checking}.csv',f'cache/rosters/{checking}-{rounded_time}.csv')
i += 1
seen_files.append(f'{checking}.csv')
except:
print(f' * {checking}.csv not present')
            print(' Saved %i data files in rosters folder.' % i)
            ff.write( f" Saved {i} data files: {seen_files}\n")
if i>2:
if 'courses.csv' in seen_files:
courses = open(f'cache/rosters/courses-{rounded_time}.csv','r')
courses.readline()
a = courses.readline()
print(a)
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
print(f" -> This semester is: {this_sem}, {year}" )
print(f" -> Building data file... {rounded_time}")
convert_roster_files(this_sem,year,rounded_time)
print(' -> moving files...')
ff.write( f" Moved files to folder: {this_sem} {year} {rounded_time}\n")
move_to_folder(this_sem,year,rounded_time,seen_files)
else:
print(" * No courses file. Not moving files.")
ff.write( f" * No courses file. Not moving files.\n")
else:
print(f"--> {exact_time}: Don't see files.")
sftp.close()
def fetch_current_rosters_auto():
fetch_minute = "56,57,58,59,00,01,02,03,04,05,06".split(",")
for m in fetch_minute:
schedule.every().hour.at(f":{m}").do(fetch_current_rosters)
#schedule.every().day.at("12:35").do(sync_non_interactive)
#schedule.every().day.at("21:00").do(sync_non_interactive)
#print(f"running every hour on the :{fetch_minute}\n")
while True:
try:
schedule.run_pending()
time.sleep(4)
except Exception as e:
import traceback
print(" ---- * * * Failed with: %s" % str(e))
ff = open('cache/pipeline.log.txt','a')
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
ff.write(traceback.format_exc()+"\n---------\n\n")
ff.close()
#schedule.CancelJob
time.sleep(1)
# Canvas data, download all new files
def sync_non_interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
gotten = os.listdir(local_data_folder)
wanted = []
i = 0
for x in resp['files']:
filename = x['filename']
exi = "No "
if filename in gotten: exi = "Yes"
else: wanted.append(x)
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
i += 1
print("I will attempt to download %i files." % len(wanted))
#answer = input("Press enter to begin, or q to quit ")
#if not answer == '': return
good_count = 0
bad_count = 0
for W in wanted:
print("Downloading: " + W['filename'])
response = requests.request(method='GET', url=W['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
(response.status_code, response.reason))
print('URL: ' + W['url'])
bad_count += 1
else:
#Use the downloaded data
with open(local_data_folder + W['filename'], 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
good_count += 1
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
## OLD STYLE CANVAS DATA
# Get something from Canvas Data
def do_request(path):
#Set up the request pieces
method = 'GET'
host = 'api.inshosteddata.com'
    apiTime = datetime.datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
apiContentType = 'application/json'
msgList = []
msgList.append(method)
msgList.append(host)
msgList.append(apiContentType)
msgList.append('')
msgList.append(path)
msgList.append('')
msgList.append(apiTime)
msgList.append(apiSecret)
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
sig = sig.decode('utf-8')
headers = {}
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
headers['Date'] = apiTime
headers['Content-type'] = apiContentType
#Submit the request/get a response
uri = "https://"+host+path
print (uri)
print (headers)
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
#Check to make sure the request was ok
if(response.status_code != 200):
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
jsonData = response.json()
#print(json.dumps(jsonData, indent=4))
return jsonData
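# Illustrative sketch (mirrors sync_non_interactive above): use do_request() to list the
# files the old-style Canvas Data API currently offers, without downloading anything.
def _example_list_canvas_data_files():
    resp = do_request('/api/account/self/file/sync')
    if resp:
        for f in resp.get('files', []):
            print(f['filename'])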
################
################ SENDING DATA AWAY
################
################
################
# Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1):
import pysftp
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
#todo: these paths
#files = sftp.listdir()
#print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath)
files = sftp.listdir()
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
if prompt:
input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
sftp.close()
"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
print("This local file: " + f + " ", end=' ')
if not f in files:
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
print("Uploaded.")
else:
print("Skipped.")
"""
"""if len(files)==3 and 'users.csv' in files:
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
sftp.get('users.csv','rosters/users-'+folder+'.csv')
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
print folder + '\tSaved three data files in rosters folder.'
courses = open('rosters/courses-'+folder+'.csv','r')
courses.readline()
a = courses.readline()
print a
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
#print this_sem, "", year
print folder + '\tbuilding data file...'
convert_roster_files(this_sem,year,folder)
print folder + '\tmoving files...'
move_to_folder(this_sem,year,folder)
else:
print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################
def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"
def question(t,bracket=1):
ret = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
ret += "<a name='" + match.group(1) + "'></a>"
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
id = ''
for p in parts:
            if p and re.search(r'[a-zA-Z]',p[0]): id += p[0]
ret += "<a name='%s'></a>" % id.lower()
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
def answer(t):
return t + '</div></div>\n'
def read_paragraph_element(element,type="NORMAL_TEXT"):
"""Returns the text in the given ParagraphElement.
Args:
element: a ParagraphElement from a Google Doc.
"""
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run:
return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
def get_doc(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
#ooout = open(fileout,'w')
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
if verbose: print(document)
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
if verbose: print(doc_content)
doc_objects = document.get('inlineObjects')
if verbose: print(doc_objects)
doc_lists = document.get('lists')
text = '<div class="acrd_grp" data-accordion-group="">'
last_type = ''
answer_text = ''
in_a_list = ''
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
if doc_objects:
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
#input('x?')
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
tempout.write('- - - - - - - -\n\n')
#for value in doc_lists:
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
tempout.write('- - - - - - - -\n\n')
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
# todo: x link, x bold, list, image.
tag_fxn = para
if 'paragraph' in value:
this_text = ''
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if lid == list_stack[0]: # 2
pass
else:
if not lid in list_stack: # 3
list_stack.append(lid)
                        else: # 4 - a nested list ended; pop back down to the list we're returning to
                            while list_stack and list_stack[-1] != lid:
                                list_stack.pop()
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer_text += "<ul>" * deeper
elif deeper < 0:
deeper = -1 * deeper
answer_text += "</ul>" * deeper
if len(list_stack):
tag_fxn = li
elements = value.get('paragraph').get('elements')
# inlineObjectElement": {
# "inlineObjectId": "kix.ssseeu8j9cfx",
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
#text += json.dumps(style, sort_keys=True, indent=4)
if 'namedStyleType' in style:
type = style['namedStyleType']
for elem in elements:
# text content
this_text += read_paragraph_element(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
if last_type=='NORMAL_TEXT' and type!=last_type:
text += answer(answer_text)
answer_text = ''
if type=='HEADING_2':
text += sec(this_text)
this_text = ''
elif type=='HEADING_3':
text += question(this_text,bracket)
this_text = ''
else:
answer_text += tag_fxn(this_text)
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
# The text in table cells are in nested Structural Elements and tables may be
# nested.
text += "\nTABLE\n"
#table = value.get('table')
#for row in table.get('tableRows'):
# cells = row.get('tableCells')
# for cell in cells:
# text += read_strucutural_elements(cell.get('content'))
#elif 'tableOfContents' in value:
# # The text in the TOC is also in a Structural Element.
# toc = value.get('tableOfContents')
# text += read_strucutural_elements(toc.get('content'))
#else:
# print(json.dumps(value, sort_keys=True, indent=4))
text += answer(answer_text)
#text += '</div>'
#print(text)
return text
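# Illustrative sketch (not wired into the menu): render a Google Doc into the HTML
# accordion fragment that get_doc() produces and save it locally. The document id is a
# placeholder.
def _example_save_doc_html():
    html = get_doc('YOUR_GOOGLE_DOC_ID')
    with codecs.open('cache/trash/doc_preview.html', 'w', 'utf-8') as out:
        out.write(html)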
######### TRY #2 ######
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run: return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
begin = '<em>' + begin
end = end + '</em>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
text = t[7:].strip()
parts = text.split(", ")
return ('icons',parts)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
text = t[6:].strip()
parts = text.split(", ")
return ('tags',parts)
def handle_question(t,bracket=1):
anchor = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
anchor = match.group(1).lower()
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
for p in parts:
            if p and re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
return ('question', t, anchor)
def handle_answer(t):
return ('answer',t)
def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
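# Worked examples for the handlers above (inputs are hypothetical):
#   handle_icons("Icons: star, clock")  -> ('icons', ['star', 'clock'])
#   handle_tags("Tags: email, quiz")    -> ('tags', ['email', 'quiz'])
#   handle_question("How do I reset my password? [reset]")
#       -> ('question', 'How do I reset my password? ', 'reset')
#   handle_sec("Getting Started")       -> ('section', 'Getting Started')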
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
global img_count, img_lookup, img_heights, img_widths
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) \
+ "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
doc_objects = document.get('inlineObjects')
doc_lists = document.get('lists')
#text = ''
result = []
last_type = ''
#answer_text = ''
answer = []
in_a_list = ''
# Get all the images
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
fetched = fetch_doc_image(k,value)
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
tag_fxn = handle_para
if 'paragraph' in value:
this_text = ''
# First we deal with if we're in a list.
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one (do nothing),
# (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if not lid == list_stack[0]:
if not lid in list_stack: # 3
list_stack.append(lid)
                        else: # 4 - a nested list ended; pop back down to the list we're returning to
                            while list_stack and list_stack[-1] != lid:
                                list_stack.pop()
elif len(list_stack) > 0:
# current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer.append("<ul>" * deeper)
elif deeper < 0:
deeper = -1 * deeper
answer.append("</ul>" * deeper)
if len(list_stack):
tag_fxn = handle_li
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
elements = value.get('paragraph').get('elements')
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
if 'namedStyleType' in style:
type = style['namedStyleType']
# and FINALLY, the actual contents.
for elem in elements:
# text content
this_text += read_paragraph_element_2(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
# Now for something tricky. Call an appropriate handler, based on:
# (a) what is the paragraph style type?
# (b) is it different from the prev one?
if last_type=='NORMAL_TEXT' and type!=last_type:
if this_text.strip():
result.append(handle_answer(answer))
answer = []
#answer_text = ''
if type=='HEADING_2' and this_text.strip():
result.append( handle_sec(this_text) )
this_text = ''
elif type=='HEADING_3' and this_text.strip():
result.append(handle_question(this_text,bracket))
this_text = ''
else:
if this_text.lower().startswith('tags:'):
tag_fxn = handle_tags
if this_text.lower().startswith('icons:'):
tag_fxn = handle_icons
if this_text.strip():
answer.append(tag_fxn(this_text))
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
pass
result.append(handle_answer(answer))
return json.dumps(result,indent=4)
def process_reg_history(term='fa25'):
from collections import defaultdict
from itertools import groupby
from operator import itemgetter
def read_grouped_csv(path):
with open(path, newline='') as f:
fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
reader = csv.DictReader(f, fieldnames=fieldnames)
            rows = sorted(reader, key=lambda r: r['datetime'])  # sort so groupby can batch rows sharing a timestamp
grouped = {}
for ts, group in groupby(rows, key=itemgetter('datetime')):
grouped[ts] = {r['crn']: r for r in group}
return grouped
def crossed_threshold(old_val, new_val, max_val):
thresholds = [0.25, 0.5, 0.75, 1.0]
if int(max_val) == 0:
return False, None
old_ratio = int(old_val) / int(max_val)
new_ratio = int(new_val) / int(max_val)
for t in thresholds:
if old_ratio < t <= new_ratio:
return True, int(t * 100)
return False, None
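    # Worked example: crossed_threshold('11', '13', '50') -> (True, 25) because
    # 11/50 = 0.22 sits below the 25% mark and 13/50 = 0.26 reaches it;
    # crossed_threshold('5', '6', '0') -> (False, None) since zero-capacity sections are skipped.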
def detect_changes(prev, curr):
changes = defaultdict(list)
all_crns = prev.keys() | curr.keys()
for crn in all_crns:
o, n = prev.get(crn), curr.get(crn)
if not o:
changes[crn].append((n['datetime'], "Section was added."))
elif not n:
changes[crn].append((
o['datetime'],
f"Section was removed (last seen: teacher {o['teacher']}, "
f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
))
else:
dt = n['datetime']
if o['teacher'] != n['teacher']:
changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
if o['enrolled'] != n['enrolled']:
crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
if crossed:
changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']:
changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
return changes
def time_to_iso(s):
return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat()
def detect_changes_structured(prev, curr):
changes = defaultdict(list)
all_crns = prev.keys() | curr.keys()
for crn in all_crns:
o, n = prev.get(crn), curr.get(crn)
if not o:
changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."})
elif not n:
changes[crn].append(
{'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.",
'value': o['enrolled'], 'capacity': o['max'], })
else:
dt = time_to_iso(n['datetime'])
if o['teacher'] != n['teacher']:
changes[crn].append({'time':dt, "type":'teacher_change',
'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.",
'old_teacher': o['teacher'], 'new_teacher': n['teacher'], })
if o['enrolled'] != n['enrolled']:
crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
if crossed:
changes[crn].append({'time':dt, "type":'enrollment_milestone',
'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).",
'percent':percent,'value':n['enrolled'],'capacity':n['max'] })
                    if int(n['waitlisted']) > 10 and int(o['waitlisted']) < int(n['waitlisted']):
                        changes[crn].append({'time':dt, "type":'enrollment_milestone',
                            'message': f"Waitlist exceeds 10: {n['waitlisted']}.",
                            'value':n['waitlisted']})
return changes
def process_diff_timeline(path):
snapshots = read_grouped_csv(path)
timeline = sorted(snapshots.keys())
timeline_diffs = []
timeline_diffs_structured = []
course_names = {} # crn -> latest known course name
for i in range(1, len(timeline)):
prev_ts, curr_ts = timeline[i-1], timeline[i]
prev, curr = snapshots[prev_ts], snapshots[curr_ts]
# update course name map
for crn, row in curr.items():
course_names[crn] = row['course']
delta = detect_changes(prev, curr)
timeline_diffs.append(delta)
delta_structured = detect_changes_structured(prev,curr)
timeline_diffs_structured.append(delta_structured)
# Flatten and group by crn
crn_changes = defaultdict(list)
for delta in timeline_diffs:
for crn, changes in delta.items():
crn_changes[crn].extend(changes)
# Flatten and group by crn
crn_changes_structured = defaultdict(list)
for delta in timeline_diffs_structured:
for crn, changes in delta.items():
crn_changes_structured[crn].extend(changes)
# Sort changes for each CRN by datetime
for crn in crn_changes:
crn_changes[crn].sort(key=lambda x: x[0])
        # Sort structured changes for each CRN by time
        for crn in crn_changes_structured:
            crn_changes_structured[crn].sort(key=lambda x: x['time'])
return crn_changes, crn_changes_structured, course_names
fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text
fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8')
fresh_file.write(fresh_history)
fresh_file.close()
output1 = codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8')
output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8')
changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv")
# once for plain text
for crn in sorted(changes, key=lambda c: course_names.get(c, "")):
course = course_names.get(crn, "")
course_output = {'code': course, 'crn':crn,'events':[]}
print(f"\n{course} (CRN {crn}):")
output1.write(f"\n{course} (CRN {crn}):\n")
for dt, msg in changes[crn]:
print(f" [{dt}] {msg}")
output1.write(f" [{dt}] {msg}\n")
course_output['events'].append({'message':msg, 'time':time_to_iso(dt)})
# again for structured
crn_list = []
for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")):
course = course_names.get(crn, "")
course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]}
crn_list.append(course_output)
output2.write( json.dumps(crn_list,indent=2) )
output2.close()
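# Shape of one record in cache/reg_timeline_<term>.json (values are hypothetical):
#   {"code": "CSIS 42", "crn": "40123", "events": [
#       {"time": "2025-05-01T09:00:00", "type": "enrollment_milestone",
#        "message": "Enrollment crossed 50% (15/30).",
#        "percent": 50, "value": "15", "capacity": "30"}]}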
def recreate_all():
for x in 'sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24'.split(' '):
try:
recreate_reg_data(x)
except Exception as e:
print(f'Failed on {x} with: {e}')
def recreate_reg_data(term="fa25"):
from collections import defaultdict
from datetime import datetime
def parse_row(row):
dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
crn = row['crn']
enrolled = int(row['enrolled'])
return dt, row['datetime'], crn, enrolled
def reduce_latest_per_day(rows):
latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled)
latest_ts_by_date = {} # date → (dt, ts) for header naming
for row in rows:
dt, full_ts, crn, enrolled = parse_row(row)
date_str = dt.date().isoformat()
ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want
# for each crn, per day, keep latest reading
if date_str not in latest[crn] or dt > latest[crn][date_str][0]:
latest[crn][date_str] = (dt, ts_header, enrolled)
# also record latest timestamp per day for consistent column headers
if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]:
latest_ts_by_date[date_str] = (dt, ts_header)
return latest, [ts for _, ts in sorted(latest_ts_by_date.values())]
def pivot_table(latest, headers):
crns = sorted(latest)
table = []
for crn in crns:
row = [crn]
for ts in headers:
date_str = ts[:10] # match on YYYY-MM-DD
val = latest[crn].get(date_str)
if val and val[1] == ts:
row.append(str(val[2]))
else:
row.append("")
table.append(row)
return ['crn'] + headers, table
#with open(f"cache/reg_history_{term}.csv", newline='') as f:
from io import StringIO
url = f"https://gavilan.cc/schedule/reg_history_{term}.csv"
# Download
resp = requests.get(url)
resp.raise_for_status() # raises if bad status
# Wrap the text in a file-like object
f = StringIO(resp.text)
fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
reader = csv.DictReader(f, fieldnames=fieldnames)
rows = list(reader)
latest, headers = reduce_latest_per_day(rows)
header_row, table = pivot_table(latest, headers)
with open(f"cache/reg_data_{term}.csv", "w", newline='') as f:
writer = csv.writer(f)
writer.writerow(header_row)
writer.writerows(table)
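# Illustrative usage: rebuild the daily pivot for one term. The output lands in
# cache/reg_data_<term>.csv with one row per CRN and one column per day, where each
# column header is that day's latest snapshot hour (e.g. 2025-05-01T09).
#   recreate_reg_data('fa25')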
if __name__ == "__main__":
print ('')
options = { 1: ['Fetch rosters on schedule',fetch_current_rosters_auto] ,
2: ['Get canvas data 2024 style', canvas_data_2024_run ],
3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
4: ['Narrative timeline of section updates', process_reg_history],
5: ['Create narrative format all semesters', recreate_all],
6: ['Recreate reg_data from full reg history', recreate_reg_data],
}
'''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
2: ['Fetch rosters',fetch_current_rosters] ,
3:
4: ['Compute how registration is filling up classes', schedule_filling] ,
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
6: ['Canvas data: interactive sync', interactive ],
7: ['Canvas data: automated sync', sync_non_interactive ],
8:
9:
16: ['Scrape schedule from ssb', scrape_schedule_multi ],
14: ['Generate latestart schedule', list_latestarts ],
15: ['Test ssb calls with python', scrape_schedule_py ],
10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data_from_cvc],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
'''
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
# Testing
#if __name__ == "__main__":
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
#print "These are the users: "
#print users
#getSemesterSchedule()
#get_doc()
#pass