From 9584f45f3076620b54594f09f4524704a0753939 Mon Sep 17 00:00:00 2001 From: Peter Howell Date: Fri, 5 Sep 2025 23:06:34 +0000 Subject: [PATCH] cleanup --- content.py | 329 ++++++- courses.py | 19 +- depricated.py | 92 ++ localcache.py | 4 +- pipelines.py | 2553 ++++++++++++++++++++----------------------------- search.py | 23 + users.py | 8 +- 7 files changed, 1472 insertions(+), 1556 deletions(-) diff --git a/content.py b/content.py index 9332113..c8b92b4 100644 --- a/content.py +++ b/content.py @@ -1528,6 +1528,334 @@ LANE: HyFlex +################ +################ GOOGLE DOCS HELPERS (moved from pipelines) +################ + +def sec(t): return "
"+t+"

\n" +def para(t): return "

"+t+"

\n" +def ul(t): return "\n" +def li(t): return "
  • "+t+"
  • \n" + +def question(t,bracket=1): + ret = '' + match = re.search( r'\[(.*)\]', t) + if match and bracket: + ret += "" + t = re.sub( r'\[.*\]','',t) + else: + parts = t.split(' ') + id = '' + for p in parts: + if re.search(r'[a-zA-Z]',p[0]): id += p[0] + ret += "" % id.lower() + return ret + '

    ' + t + '

    \n
    ' + +def answer(t): + return t + '
    \n' + +def read_paragraph_element(element,type="NORMAL_TEXT"): + text_run = element.get('textRun') + begin = '' + end = '' + if not text_run: + return '' + if 'textStyle' in text_run and 'link' in text_run['textStyle']: + begin = '' + end = '' + if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": + begin = '' + begin + end = end + '' + content = text_run.get('content') + content = re.sub(u'\u000b','
    \n',content) + return begin + content + end + +def read_paragraph_element_2(element,type="NORMAL_TEXT"): + return read_paragraph_element(element,type) + + + +# t is a string that begins with "Icons: " ... and contains comma(space) separated list +def handle_icons(t): + text = t[7:].strip() + parts = text.split(", ") + return ('icons',parts) + +# t is a string that begins with "Tags: " ... and contains comma(space) separated list +def handle_tags(t): + text = t[6:].strip() + parts = text.split(", ") + return ('tags',parts) + +def handle_question(t,bracket=1): + anchor = '' + match = re.search( r'\[(.*)\]', t) + if match and bracket: + anchor = match.group(1).lower() + t = re.sub( r'\[.*\]','',t) + else: + parts = t.split(' ') + for p in parts: + if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower() + return ('question', t, anchor) + +def handle_answer(t): + return ('answer',t) + +def handle_sec(t): return ('section',t) +def handle_para(t): return ('paragraph',t) +def handle_ul(t): return ('unorderdedlist',t) +def handle_li(t): return ('listitem',t) + + + +img_count = 1 +img_lookup = {} +img_heights = {} +img_widths = {} + + +'''def fetch_doc_image(k,value): + global img_count, img_lookup, img_heights, img_widths + if 'inlineObjectProperties' in value: + if 'embeddedObject' in value['inlineObjectProperties']: + if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: + if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: + print(k) + uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] + response = requests.get(uu, stream=True) + name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] + img_count += 1 + img_lookup[k] = name + + with open('cache/doc_images/'+name, 'wb') as out_file: + shutil.copyfileobj(response.raw, out_file) + print(uu) + print(response.headers) + print(name) + del response + if 'size' in value['inlineObjectProperties']['embeddedObject']: + img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) + img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) +''' + + +def fetch_doc_image(k,value): + import shutil + if 'inlineObjectProperties' in value: + if 'embeddedObject' in value['inlineObjectProperties']: + if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: + if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: + uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] + response = requests.get(uu, stream=True) + name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1] + with open('cache/doc_images/'+name, 'wb') as out_file: + shutil.copyfileobj(response.raw, out_file) + del response + return True + +def get_doc(docid, bracket=1, verbose=0): + import pickle, shutil + import os.path + from googleapiclient.discovery import build + from google_auth_oauthlib.flow import InstalledAppFlow + from google.auth.transport.requests import Request + + #ooout = open(fileout,'w') + + # If modifying these scopes, delete the file token.pickle. + SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] + creds = None + # The file token.pickle stores the user's access and refresh tokens, and is + # created automatically when the authorization flow completes for the first + # time. 
+ if os.path.exists('token.pickle'): + with open('token.pickle', 'rb') as token: + creds = pickle.load(token) + # If there are no (valid) credentials available, let the user log in. + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + 'credentials.json', SCOPES) + creds = flow.run_local_server(port=0) + # Save the credentials for the next run + with open('token.pickle', 'wb') as token: + pickle.dump(creds, token) + + service = build('docs', 'v1', credentials=creds) + + # Retrieve the documents contents from the Docs service. + document = service.documents().get(documentId=docid).execute() + if verbose: print(document) + + tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') + tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n") + if verbose: print('The title of the document is: {}'.format(document.get('title'))) + doc_content = document.get('body').get('content') + if verbose: print(doc_content) + + doc_objects = document.get('inlineObjects') + if verbose: print(doc_objects) + + doc_lists = document.get('lists') + + text = '
    ' + last_type = '' + answer_text = '' + in_a_list = '' + + img_count = 1 + img_lookup = {} + img_heights = {} + img_widths = {} + + if doc_objects: + for k,value in doc_objects.items(): + tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") + if 'inlineObjectProperties' in value: + if 'embeddedObject' in value['inlineObjectProperties']: + if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: + if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: + print(k) + uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] + response = requests.get(uu, stream=True) + name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] + img_count += 1 + + img_lookup[k] = name + + with open('cache/doc_images/'+name, 'wb') as out_file: + shutil.copyfileobj(response.raw, out_file) + print(uu) + print(response.headers) + print(name) + #input('x?') + del response + if 'size' in value['inlineObjectProperties']['embeddedObject']: + img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) + img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) + + tempout.write('- - - - - - - -\n\n') + #for value in doc_lists: + # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n") + + tempout.write('- - - - - - - -\n\n') + list_stack = [] + list_depth = 0 + last_list_depth = 0 + for value in doc_content: + tempout.write( json.dumps(value,indent=2) + "\n\n\n") + if verbose: print(json.dumps(value, sort_keys=True, indent=4)) + + # todo: x link, x bold, list, image. + tag_fxn = para + if 'paragraph' in value: + this_text = '' + + if 'bullet' in value['paragraph']: + # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one. + + lid = value['paragraph']['bullet']['listId'] + + if not list_stack: # 1 + list_stack.append(lid) + else: + if lid == list_stack[0]: # 2 + pass + + else: + if not lid in list_stack: # 3 + list_stack.append(lid) + else: # 4 + x = list_stack.pop() + while x != lid: list_stack.pop() + elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open. 
+ list_stack = [] + + list_depth = len(list_stack) + + deeper = list_depth - last_list_depth + + if deeper > 0: + answer_text += "" * deeper + + if len(list_stack): + tag_fxn = li + + elements = value.get('paragraph').get('elements') + + # inlineObjectElement": { + # "inlineObjectId": "kix.ssseeu8j9cfx", + + if 'paragraphStyle' in value.get('paragraph'): + style = value.get('paragraph').get('paragraphStyle') + #text += json.dumps(style, sort_keys=True, indent=4) + if 'namedStyleType' in style: + type = style['namedStyleType'] + + for elem in elements: + + # text content + this_text += read_paragraph_element(elem,type) + + # image content + if 'inlineObjectElement' in elem: + vpi = elem['inlineObjectElement'] + if 'inlineObjectId' in vpi: + ii = vpi['inlineObjectId'] + if ii in img_lookup: + img = img_lookup[ii] + h = img_heights[ii] + w = img_widths[ii] + this_text += '' % (img,w,h) + + + + if last_type=='NORMAL_TEXT' and type!=last_type: + text += answer(answer_text) + answer_text = '' + + if type=='HEADING_2': + text += sec(this_text) + this_text = '' + elif type=='HEADING_3': + text += question(this_text,bracket) + this_text = '' + else: + answer_text += tag_fxn(this_text) + this_text = '' + last_type = type + last_list_depth = list_depth + + elif 'table' in value: + # The text in table cells are in nested Structural Elements and tables may be + # nested. + text += "\nTABLE\n" + #table = value.get('table') + #for row in table.get('tableRows'): + # cells = row.get('tableCells') + # for cell in cells: + # text += read_strucutural_elements(cell.get('content')) + #elif 'tableOfContents' in value: + # # The text in the TOC is also in a Structural Element. + # toc = value.get('tableOfContents') + # text += read_strucutural_elements(toc.get('content')) + + #else: + # print(json.dumps(value, sort_keys=True, indent=4)) + + text += answer(answer_text) + #text += '
    ' + #print(text) + return text + +def get_doc_generic(docid, bracket=1, verbose=0): + return get_doc(docid, bracket, verbose) @@ -1567,4 +1895,3 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() - diff --git a/courses.py b/courses.py index 00a49e0..b33133f 100644 --- a/courses.py +++ b/courses.py @@ -2690,24 +2690,7 @@ def enrollment_helper(): # fill percentage for each section, then by mode, tod, campus -def try_clustering(df): - # Import required libraries - from sklearn.cluster import KMeans - - # Preprocessing - - # Assuming df is your DataFrame and "modes" is your categorical column - #df['code'] = df['code'].astype('category').cat.codes - - # Removing any other unnecessary columns - df = df.drop(['code'], axis=1) - - # Perform KMeans clustering - kmeans = KMeans(n_clusters=4, random_state=0).fit(df) - - # Get the cluster labels - labels = kmeans.labels_ - +## moved: try_clustering now in search.py # Add labels to the DataFrame #df['clusters'] = labels #print(df) diff --git a/depricated.py b/depricated.py index b34a5c2..1c2e604 100644 --- a/depricated.py +++ b/depricated.py @@ -4,6 +4,98 @@ # from pipelines - canvas data + +# Canvas data, download all new files +def sync_non_interactive(): + resp = do_request('/api/account/self/file/sync') + mylog.write(json.dumps(resp, indent=4)) + #mylog.close() + gotten = os.listdir(local_data_folder) + wanted = [] + i = 0 + for x in resp['files']: + filename = x['filename'] + exi = "No " + if filename in gotten: exi = "Yes" + else: wanted.append(x) + + print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename )) + i += 1 + print("I will attempt to download %i files." % len(wanted)) + + #answer = input("Press enter to begin, or q to quit ") + #if not answer == '': return + + good_count = 0 + bad_count = 0 + for W in wanted: + print("Downloading: " + W['filename']) + response = requests.request(method='GET', url=W['url'], stream=True) + if(response.status_code != 200): + print('Request response went bad. Got back a %s code, meaning the request was %s' % \ + (response.status_code, response.reason)) + print('URL: ' + W['url']) + bad_count += 1 + + else: + #Use the downloaded data + with open(local_data_folder + W['filename'], 'wb') as fd: + for chunk in response.iter_content(chunk_size=128): + fd.write(chunk) + print("Success") + good_count += 1 + print("Out of %i files, %i succeeded and %i failed." 
% (len(wanted),good_count,bad_count)) + + +## OLD STYLE CANVAS DATA + +# Get something from Canvas Data +def do_request(path): + #Set up the request pieces + method = 'GET' + host = 'api.inshosteddata.com' + apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT') + apiContentType = 'application/json' + + msgList = [] + msgList.append(method) + msgList.append(host) + msgList.append(apiContentType) + msgList.append('') + msgList.append(path) + msgList.append('') + msgList.append(apiTime) + msgList.append(apiSecret) + + msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8') + + sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest()) + sig = sig.decode('utf-8') + + headers = {} + headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig) + headers['Date'] = apiTime + headers['Content-type'] = apiContentType + + + #Submit the request/get a response + uri = "https://"+host+path + print (uri) + print (headers) + response = requests.request(method='GET', url=uri, headers=headers, stream=True) + + #Check to make sure the request was ok + if(response.status_code != 200): + print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason)) + else: + #Use the downloaded data + jsonData = response.json() + #print(json.dumps(jsonData, indent=4)) + return jsonData + + + + def file_doesnt_exist(name): # Get list of files in current directory files = os.listdir() diff --git a/localcache.py b/localcache.py index 2668751..c1f1874 100644 --- a/localcache.py +++ b/localcache.py @@ -8,7 +8,7 @@ from datetime import datetime as dt from datetime import timedelta from dateutil.parser import parse from os.path import exists, getmtime -from pipelines import sync_non_interactive, url, header +from pipelines import url, header import util from semesters import short_to_sis @@ -1121,7 +1121,7 @@ def full_reload(): except Exception as e: print("Couldn't rename file:", str(e)) - sync_non_interactive() + #sync_non_interactive() setup_table('requests_sum1') setup_table('courses') diff --git a/pipelines.py b/pipelines.py index 7e74f56..159ee41 100644 --- a/pipelines.py +++ b/pipelines.py @@ -1,1530 +1,1023 @@ -import util -import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path -import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib -from datetime import timedelta - -from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret -from canvas_secrets import instructure_url, instructure_username, instructure_private_key - -import os, asyncio -from dap.api import DAPClient -from dap.dap_types import Credentials -from dap.integration.database import DatabaseConnection -from dap.replicator.sql import SQLReplicator - - - -""" -Everything to do with fetching data, - - From iLearn, via token - - current roster uploads from instructures sftp site - - raw logs and other from canvas data repo - - from ssb, use firefox to scrape the schedule - - -And some subsequent processing: - - Raw roster files, into a more compact json format - - Raw logs into something more useful -""" - -verbose = False - -users = {} -users_by_id = {} - -# todo: all these constants for SSB -- line 1008 -# -# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python - - - - - - -sys.setrecursionlimit( 100000 ) - -local_data_folder = 'cache/canvas_data/' -mylog = 
codecs.open(local_data_folder + 'temp_log.txt','w') - - - - -class FetchError(Exception): - pass - - -DEBUG = 0 - -def d(s,end=''): - global DEBUG - if end and DEBUG: print(s,end=end) - elif DEBUG: print(s) - -################ -################ CANVAS API MAIN FETCHING FUNCTIONS -################ -################ -################ - - - - -# Main canvas querying fxn -def fetch(target,verbose=0,params=0,media=0): - # if there are more results, recursivly call myself, adding on to the results. - results = 0 - if target[0:4] != "http": target = url + target - if verbose: - print("++ Fetching: " + target) - if media: - r2 = requests.get(target, headers = header_media) - elif params: - r2 = requests.get(target, headers = header, params = params) - else: - r2 = requests.get(target, headers = header) - #if verbose: - #print "++ Got: " + r2.text - try: - results = json.loads(r2.text) - count = len(results) - except: - print("-- Failed to parse: ", r2.text) - if verbose: - print("Got %i results" % count) - if verbose > 1: - print(r2.headers) - - tempout = codecs.open('cache/fetchcache.txt','a','utf-8') - tempout.write(r2.text+"\n\n") - tempout.close() - - if ('link' in r2.headers and count > 0): - links = r2.headers['link'].split(',') - for L in links: - ll = L.split(';') - link = ll[0].replace("<","") - link = link.replace(">","") - if re.search(r'next', ll[1]): - if (verbose): print("++ More link: " + link) - #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500') - #if (verbose): print("++ More link: " + link) - - nest = fetch(link,verbose,params,media) - if isinstance(results,dict): results.update(nest) - else: results.extend(nest) - return results - -# Main canvas querying fxn - stream version - don't die on big requests -def fetch_stream(target,verbose=0): - # if there are more results, recursivly call myself, adding on to the results. - results = 0 - while target: - if target[0:4] != "http": target = url + target - if verbose: - print("++ Fetching: " + target) - r2 = requests.get(target, headers = header) - if r2.status_code == 502: - raise FetchError() - try: - results = json.loads(r2.text) - count = len(results) - except: - print("-- Failed to parse: ", r2.text) - if verbose: - print("Got %i results" % count) - if verbose > 1: - print(r2.headers) - tempout = codecs.open('cache/fetchcache.txt','a','utf-8') - tempout.write(r2.text+"\n\n") - tempout.close() - - next_link_found = 0 - if ('link' in r2.headers and count > 0): - links = r2.headers['link'].split(',') - for L in links: - ll = L.split(';') - link = ll[0].replace("<","") - link = link.replace(">","") - if re.search(r'next', ll[1]): - target = link - next_link_found = 1 - break - if not next_link_found: target = 0 - yield results - - -# for dicts with one key, collapse that one key out, cause -# paging makes problems... example: enrollment_terms -def fetch_collapse(target,collapse='',verbose=0): - # if there are more results, recursivly call myself, adding on to the results. 
- results = 0 - if target[0:4] != "http": target = url + target - if verbose: - print("++ Fetching: " + target) - r2 = requests.get(target, headers = header) - #if verbose: - #print "++ Got: " + r2.text - try: - results = json.loads(r2.text) - except: - print("-- Failed to parse: ", r2.text) - if verbose: print(r2.headers) - - if collapse and collapse in results: - results = results[collapse] - - if ('link' in r2.headers): - links = r2.headers['link'].split(',') - for L in links: - ll = L.split(';') - link = ll[0].replace("<","") - link = link.replace(">","") - if re.search(r'next', ll[1]): - if (verbose): print("++ More link: " + link) - nest = fetch_collapse(link, collapse, verbose) - if isinstance(results,dict): results.update(nest) - else: results.extend(nest) - return results - - - - - - - - - -################ -################ CANVAS DATA -################ -################ -################ - - -# Get canvas data 2024 style -def canvas_data_2024_run(): - print("Updating all tables.") - asyncio.run(canvas_data_2024()) - print("Done with all tables.") - - -async def canvas_data_2024(): - - base_url: str = os.environ["DAP_API_URL"] - client_id: str = os.environ["DAP_CLIENT_ID"] - client_secret: str = os.environ["DAP_CLIENT_SECRET"] - #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" - - # todo: use secrets - connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db" - - desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') - credentials = Credentials.create(client_id=client_id, client_secret=client_secret) - - async with DatabaseConnection(connection_string).open() as db_connection: - async with DAPClient(base_url, credentials) as session: - #tables = await session.get_tables("canvas") - for table in desired_tables: - print(f" trying to update {table} ") - try: - #await SQLReplicator(session, db_connection).initialize("canvas", table) - await SQLReplicator(session, db_connection).synchronize("canvas", table) - except Exception as e: - print(f" - skipping {table} because {e}") - - - -# Get canvas data 2024 style -def setup_canvas_data_2024_run(): - print("Setting up all tables.") - asyncio.run(setup_canvas_data_2024()) - print("Done with all tables.") - - -async def setup_canvas_data_2024(): - - base_url: str = os.environ["DAP_API_URL"] - client_id: str = os.environ["DAP_CLIENT_ID"] - client_secret: str = os.environ["DAP_CLIENT_SECRET"] - #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" - connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db" - - desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') - credentials = Credentials.create(client_id=client_id, client_secret=client_secret) - - async with DatabaseConnection(connection_string).open() as db_connection: - async with 
DAPClient(base_url, credentials) as session: - #tables = await session.get_tables("canvas") - for table in desired_tables: - print(f" {table}") - try: - await SQLReplicator(session, db_connection).initialize("canvas", table) - except Exception as e: - print(f" - skipping {table} because {e}") - - - - - - - - - - - - - - - - - - - - - - -################ -################ ROSTERS AND REGISTRATION -################ -################ -################ - -# todo: the pipeline is disorganized. Organize it to have -# a hope of taking all this to a higher level. -# - -# todo: where does this belong in the pipeline? compare with recent_schedules() - - - -# Take the generically named rosters uploads files and move them to a semester folder and give them a date. -def move_to_folder(sem,year,folder,files): - semester = year+sem - semester_path = 'cache/rosters/%s' % semester - if not os.path.isdir('cache/rosters/'+semester): - os.makedirs('cache/rosters/'+semester) - now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') - print("+ Moving roster files to folder: %s" % semester_path) - if not os.path.isdir(semester_path): - print("+ Creating folder: %s" % semester_path) - os.makedirs(semester_path) - def safe_move(src, dst): - try: - os.replace(src, dst) - except Exception as ex: - print(f"! move failed {src} -> {dst}: {ex}") - if 'courses.csv' in files: - safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now)) - if 'enrollments.csv' in files: - safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now)) - if 'users.csv' in files: - safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now)) - - - -# Take raw upload (csv) files and make one big json out of them. -# This relates to enrollment files, not schedule. -def convert_roster_files(semester="",year="",folder=""): - if not semester: - semester = input("the semester? (ex: spring) ") - folder = input("Folder? (ex 2020-02-25-14-58-20) ") - uf = open('cache/rosters/users-'+folder+'.csv','r') - cf = open('cache/rosters/courses-'+folder+'.csv','r') - ef = open('cache/rosters/enrollments-'+folder+'.csv','r') - u = csv.DictReader(uf) - c = csv.DictReader(cf) - e = csv.DictReader(ef) - uu = [i for i in u] - cc = [i for i in c] - ee = [i for i in e] - uf.close() - cf.close() - ef.close() - myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester) - - if os.path.exists(myrosterfile): - print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile) - last_fileobj = open(myrosterfile,'r') - last_file = json.load(last_fileobj) - - last_fileobj.close() - - info = last_file[3] - last_date = info['date_filestring'] - - print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date)) - - try: - os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' 
% (year,semester,last_date)) - print(' -- ok') - except Exception as e: - print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) - print(e) - myrosterfile = "new_" + myrosterfile - pass - #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json') - #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json') - - newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), } - try: - new_roster = codecs.open(myrosterfile,'w', 'utf-8') - new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 )) - new_roster.close() - print(" -- Wrote roster info to: %s." % myrosterfile) - except Exception as e: - print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) - print(" ** " + str(e)) - - - - - - -# From instructure sftp site -def fetch_current_rosters(sftp=None, label_hour=None): - """Download roster CSVs from Instructure SFTP and post-process. - - If sftp provided, reuse it (already chdir'd to SIS). - - Files are saved using label_hour (format YYYY-MM-DD-HH). If None, compute fallback label by flooring to the hour. - """ - import pysftp - def log(msg): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - try: - with open('cache/pipeline.log.txt','a', encoding='utf-8') as f: - f.write(f"[{ts}] {msg}\n") - except Exception: - print(msg) - - close_after = False - if sftp is None: - cnopts = pysftp.CnOpts() - cnopts.hostkeys = None - sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts) - sftp.chdir('SIS') - close_after = True - try: - now = datetime.datetime.now() - if not label_hour: - # default fallback: floor to hour - label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H') - - files = set(sftp.listdir()) - log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}") - need = ['login','users','courses','enrollments'] - saved = [] - for name in need: - remote = f'{name}.csv' - target = f'cache/rosters/{name}-{label_hour}.csv' - try: - if remote in files: - if not (os.path.exists(target) and os.path.getsize(target) > 0): - temp = target + '.part' - try: - if os.path.exists(temp): - os.remove(temp) - except Exception: - pass - sftp.get(remote, temp) - # atomic replace - os.replace(temp, target) - saved.append(remote) - log(f' saved {remote} -> {target}') - else: - log(f' already exists: {target}, skipping') - else: - log(f' missing on server: {remote}') - except Exception as ex: - log(f' download failed: {remote} -> {target} err={ex}') - - if len(saved) >= 3 and 'courses.csv' in saved or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'): - try: - with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses: - courses.readline() - a = courses.readline() - parts = a.split(',') - year = parts[1][0:4] - ss = parts[1][4:6] - sem = {'30':'spring', '50':'summer', '70':'fall' } - this_sem = sem.get(ss, 'spring') - log(f"post-process for semester={this_sem} year={year} label={label_hour}") - convert_roster_files(this_sem,year,label_hour) - move_to_folder(this_sem,year,label_hour,saved) - except Exception as expp: - log(f'post-processing failed: {expp}') - else: - log('not enough files to post-process yet') - finally: - if close_after: - try: - sftp.close() - except Exception: - pass - -def fetch_current_rosters_auto(poll_seconds=15): - """Poll Instructure SFTP from the top of each hour until all 4 roster files appear, then download once. 
- - Robust logging to cache/pipeline.log.txt - - Stops polling for the remainder of that hour after success - - Tries again on the next hour - """ - import pysftp - from contextlib import contextmanager - - log_path = 'cache/pipeline.log.txt' - - def log(msg): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - try: - with open(log_path, 'a', encoding='utf-8') as f: - f.write(f"[{ts}] {msg}\n") - except Exception: - print(f"[log-fail] {msg}") - - @contextmanager - def sftp_conn(): - cnopts = pysftp.CnOpts() - cnopts.hostkeys = None - conn = None - try: - conn = pysftp.Connection(instructure_url, username=instructure_username, - private_key=instructure_private_key, cnopts=cnopts) - conn.chdir('SIS') - yield conn - finally: - try: - if conn: - conn.close() - except Exception: - pass - - required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'} - current_window = None # e.g., '2025-08-30-14' - window_done = False - window_end = None - - def compute_window(now): - """Return (label_hour_str, window_end_dt) if within a window, else (None, None). - Window spans [H-5m, H+5m] around each top-of-hour H. - If minute >= 55 -> window is next hour - If minute <= 5 -> window is current hour - Else -> not in window. - """ - m = now.minute - if m >= 55: - base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)) - elif m <= 5: - base = now.replace(minute=0, second=0, microsecond=0) - else: - return (None, None) - label = base.strftime('%Y-%m-%d-%H') - end = base + timedelta(minutes=5) # end of window - return (label, end) - - log('fetch_current_rosters_auto: starting poll loop') - while True: - now = datetime.datetime.now() - hour_key = now.strftime('%Y-%m-%d %H') - label, enddt = compute_window(now) - if label is None: - time.sleep(max(5, int(poll_seconds))) - continue - - if label != current_window: - current_window = label - window_done = False - window_end = enddt - log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.') - - if window_done: - time.sleep(5) - continue - - # Poll SFTP for files - try: - with sftp_conn() as sftp: - files = set(sftp.listdir()) - missing = list(required - files) - if missing: - log(f'Not ready yet. Missing: {missing}') - else: - log('All required files present inside window. Starting download...') - try: - fetch_current_rosters(sftp=sftp, label_hour=current_window) - log('Download complete. Marking window_done for this window.') - window_done = True - except Exception as ex_dl: - import traceback - log('Download failed: ' + str(ex_dl)) - log(traceback.format_exc()) - except Exception as ex_list: - import traceback - log('SFTP list failed: ' + str(ex_list)) - log(traceback.format_exc()) - - # Check for window timeout - try: - if not window_done and window_end and now >= window_end: - log(f'REPORT: window {current_window} ended without all files. Consider alerting/email.') - window_done = True # stop further checks this window - except Exception: - pass - - time.sleep(max(5, int(poll_seconds))) - - -# Canvas data, download all new files -def sync_non_interactive(): - resp = do_request('/api/account/self/file/sync') - mylog.write(json.dumps(resp, indent=4)) - #mylog.close() - gotten = os.listdir(local_data_folder) - wanted = [] - i = 0 - for x in resp['files']: - filename = x['filename'] - exi = "No " - if filename in gotten: exi = "Yes" - else: wanted.append(x) - - print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename )) - i += 1 - print("I will attempt to download %i files." 
% len(wanted)) - - #answer = input("Press enter to begin, or q to quit ") - #if not answer == '': return - - good_count = 0 - bad_count = 0 - for W in wanted: - print("Downloading: " + W['filename']) - response = requests.request(method='GET', url=W['url'], stream=True) - if(response.status_code != 200): - print('Request response went bad. Got back a %s code, meaning the request was %s' % \ - (response.status_code, response.reason)) - print('URL: ' + W['url']) - bad_count += 1 - - else: - #Use the downloaded data - with open(local_data_folder + W['filename'], 'wb') as fd: - for chunk in response.iter_content(chunk_size=128): - fd.write(chunk) - print("Success") - good_count += 1 - print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count)) - - -## OLD STYLE CANVAS DATA - -# Get something from Canvas Data -def do_request(path): - #Set up the request pieces - method = 'GET' - host = 'api.inshosteddata.com' - apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT') - apiContentType = 'application/json' - - msgList = [] - msgList.append(method) - msgList.append(host) - msgList.append(apiContentType) - msgList.append('') - msgList.append(path) - msgList.append('') - msgList.append(apiTime) - msgList.append(apiSecret) - - msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8') - - sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest()) - sig = sig.decode('utf-8') - - headers = {} - headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig) - headers['Date'] = apiTime - headers['Content-type'] = apiContentType - - - #Submit the request/get a response - uri = "https://"+host+path - print (uri) - print (headers) - response = requests.request(method='GET', url=uri, headers=headers, stream=True) - - #Check to make sure the request was ok - if(response.status_code != 200): - print(('Request response went bad. 
Got back a ', response.status_code, ' code, meaning the request was ', response.reason)) - else: - #Use the downloaded data - jsonData = response.json() - #print(json.dumps(jsonData, indent=4)) - return jsonData - - - - - - -################ -################ SENDING DATA AWAY -################ -################ -################ - -# Upload a json file to www -def put_file(remotepath,localpath, localfile,prompt=1): - import pysftp - show_all = 0 - folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - cnopts = pysftp.CnOpts() - cnopts.hostkeys = None - - with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: - #todo: these paths - #files = sftp.listdir() - #print(folder + "\tI see these files on remote: ", files, "\n") - sftp.chdir(remotepath) - files = sftp.listdir() - if show_all: print(folder + "\tI see these files on remote: ", files, "\n") - localf = os.listdir(localpath) - if show_all: print("I see these local: ", localf) - if prompt: - input('ready to upload') - sftp.put(localpath+localfile, localfile, preserve_mtime=True) - sftp.close() - - - """ - # copy files and directories from local static, to remote static, - # preserving modification times on the files - for f in localf: - print("This local file: " + f + " ", end=' ') - if not f in files: - sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True) - print("Uploaded.") - else: - print("Skipped.") - """ - - """if len(files)==3 and 'users.csv' in files: - sftp.get('courses.csv','rosters/courses-'+folder+'.csv') - sftp.get('users.csv','rosters/users-'+folder+'.csv') - sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv') - print folder + '\tSaved three data files in rosters folder.' - - courses = open('rosters/courses-'+folder+'.csv','r') - courses.readline() - a = courses.readline() - print a - courses.close() - parts = a.split(',') - year = parts[1][0:4] - ss = parts[1][4:6] - #print parts[1] - sem = {'30':'spring', '50':'summer', '70':'fall' } - this_sem = sem[ss] - #print this_sem, "", year - print folder + '\tbuilding data file...' - convert_roster_files(this_sem,year,folder) - print folder + '\tmoving files...' - move_to_folder(this_sem,year,folder) - else: - print folder + "\tDon't see all three files.""" - - - -################ -################ GOOGLE DOCS -################ -################ -################ - -def sec(t): return "
    "+t+"

    \n" -def para(t): return "

    "+t+"

    \n" -def ul(t): return "\n" -def li(t): return "
  • "+t+"
  • \n" - -def question(t,bracket=1): - ret = '' - match = re.search( r'\[(.*)\]', t) - if match and bracket: - ret += "" - t = re.sub( r'\[.*\]','',t) - else: - parts = t.split(' ') - id = '' - for p in parts: - if re.search(r'[a-zA-Z]',p[0]): id += p[0] - ret += "" % id.lower() - return ret + '

    ' + t + '

    \n
    ' - -def answer(t): - return t + '
    \n' - -def read_paragraph_element(element,type="NORMAL_TEXT"): - """Returns the text in the given ParagraphElement. - - Args: - element: a ParagraphElement from a Google Doc. - """ - text_run = element.get('textRun') - begin = '' - end = '' - if not text_run: - return '' - if 'textStyle' in text_run and 'link' in text_run['textStyle']: - begin = '' - end = '' - if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": - begin = '' + begin - end = end + '' - - content = text_run.get('content') - content = re.sub(u'\u000b','
    \n',content) - - return begin + content + end - - -def get_doc(docid, bracket=1, verbose=0): - import pickle - import os.path - from googleapiclient.discovery import build - from google_auth_oauthlib.flow import InstalledAppFlow - from google.auth.transport.requests import Request - - #ooout = open(fileout,'w') - - # If modifying these scopes, delete the file token.pickle. - SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] - creds = None - # The file token.pickle stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. - if os.path.exists('token.pickle'): - with open('token.pickle', 'rb') as token: - creds = pickle.load(token) - # If there are no (valid) credentials available, let the user log in. - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - 'credentials.json', SCOPES) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open('token.pickle', 'wb') as token: - pickle.dump(creds, token) - - service = build('docs', 'v1', credentials=creds) - - # Retrieve the documents contents from the Docs service. - document = service.documents().get(documentId=docid).execute() - if verbose: print(document) - - tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') - tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n") - if verbose: print('The title of the document is: {}'.format(document.get('title'))) - doc_content = document.get('body').get('content') - if verbose: print(doc_content) - - doc_objects = document.get('inlineObjects') - if verbose: print(doc_objects) - - doc_lists = document.get('lists') - - text = '
    ' - last_type = '' - answer_text = '' - in_a_list = '' - - img_count = 1 - img_lookup = {} - img_heights = {} - img_widths = {} - - if doc_objects: - for k,value in doc_objects.items(): - tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") - if 'inlineObjectProperties' in value: - if 'embeddedObject' in value['inlineObjectProperties']: - if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: - if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: - print(k) - uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] - response = requests.get(uu, stream=True) - name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] - img_count += 1 - - img_lookup[k] = name - - with open('cache/doc_images/'+name, 'wb') as out_file: - shutil.copyfileobj(response.raw, out_file) - print(uu) - print(response.headers) - print(name) - #input('x?') - del response - if 'size' in value['inlineObjectProperties']['embeddedObject']: - img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) - img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) - - tempout.write('- - - - - - - -\n\n') - #for value in doc_lists: - # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n") - - tempout.write('- - - - - - - -\n\n') - list_stack = [] - list_depth = 0 - last_list_depth = 0 - for value in doc_content: - tempout.write( json.dumps(value,indent=2) + "\n\n\n") - if verbose: print(json.dumps(value, sort_keys=True, indent=4)) - - # todo: x link, x bold, list, image. - tag_fxn = para - if 'paragraph' in value: - this_text = '' - - if 'bullet' in value['paragraph']: - # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one. - - lid = value['paragraph']['bullet']['listId'] - - if not list_stack: # 1 - list_stack.append(lid) - else: - if lid == list_stack[0]: # 2 - pass - - else: - if not lid in list_stack: # 3 - list_stack.append(lid) - else: # 4 - x = list_stack.pop() - while x != lid: list_stack.pop() - elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open. 
- list_stack = [] - - list_depth = len(list_stack) - - deeper = list_depth - last_list_depth - - if deeper > 0: - answer_text += "" * deeper - - if len(list_stack): - tag_fxn = li - - elements = value.get('paragraph').get('elements') - - # inlineObjectElement": { - # "inlineObjectId": "kix.ssseeu8j9cfx", - - if 'paragraphStyle' in value.get('paragraph'): - style = value.get('paragraph').get('paragraphStyle') - #text += json.dumps(style, sort_keys=True, indent=4) - if 'namedStyleType' in style: - type = style['namedStyleType'] - - for elem in elements: - - # text content - this_text += read_paragraph_element(elem,type) - - # image content - if 'inlineObjectElement' in elem: - vpi = elem['inlineObjectElement'] - if 'inlineObjectId' in vpi: - ii = vpi['inlineObjectId'] - if ii in img_lookup: - img = img_lookup[ii] - h = img_heights[ii] - w = img_widths[ii] - this_text += '' % (img,w,h) - - - - if last_type=='NORMAL_TEXT' and type!=last_type: - text += answer(answer_text) - answer_text = '' - - if type=='HEADING_2': - text += sec(this_text) - this_text = '' - elif type=='HEADING_3': - text += question(this_text,bracket) - this_text = '' - else: - answer_text += tag_fxn(this_text) - this_text = '' - last_type = type - last_list_depth = list_depth - - elif 'table' in value: - # The text in table cells are in nested Structural Elements and tables may be - # nested. - text += "\nTABLE\n" - #table = value.get('table') - #for row in table.get('tableRows'): - # cells = row.get('tableCells') - # for cell in cells: - # text += read_strucutural_elements(cell.get('content')) - #elif 'tableOfContents' in value: - # # The text in the TOC is also in a Structural Element. - # toc = value.get('tableOfContents') - # text += read_strucutural_elements(toc.get('content')) - - #else: - # print(json.dumps(value, sort_keys=True, indent=4)) - - text += answer(answer_text) - #text += '
    ' - #print(text) - return text - -######### TRY #2 ###### - - -def read_paragraph_element_2(element,type="NORMAL_TEXT"): - text_run = element.get('textRun') - begin = '' - end = '' - if not text_run: return '' - if 'textStyle' in text_run and 'link' in text_run['textStyle']: - begin = '' - end = '' - if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": - begin = '' + begin - end = end + '' - elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT": - begin = '' + begin - end = end + '' - content = text_run.get('content') - content = re.sub(u'\u000b','
    \n',content) - return begin + content + end - -# t is a string that begins with "Icons: " ... and contains comma(space) separated list -def handle_icons(t): - text = t[7:].strip() - parts = text.split(", ") - return ('icons',parts) - -# t is a string that begins with "Tags: " ... and contains comma(space) separated list -def handle_tags(t): - text = t[6:].strip() - parts = text.split(", ") - return ('tags',parts) - -def handle_question(t,bracket=1): - anchor = '' - match = re.search( r'\[(.*)\]', t) - if match and bracket: - anchor = match.group(1).lower() - t = re.sub( r'\[.*\]','',t) - else: - parts = t.split(' ') - for p in parts: - if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower() - return ('question', t, anchor) - -def handle_answer(t): - return ('answer',t) - -def handle_sec(t): return ('section',t) -def handle_para(t): return ('paragraph',t) -def handle_ul(t): return ('unorderdedlist',t) -def handle_li(t): return ('listitem',t) - - - -img_count = 1 -img_lookup = {} -img_heights = {} -img_widths = {} - - -def fetch_doc_image(k,value): - global img_count, img_lookup, img_heights, img_widths - if 'inlineObjectProperties' in value: - if 'embeddedObject' in value['inlineObjectProperties']: - if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: - if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: - print(k) - uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] - response = requests.get(uu, stream=True) - name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] - img_count += 1 - img_lookup[k] = name - - with open('cache/doc_images/'+name, 'wb') as out_file: - shutil.copyfileobj(response.raw, out_file) - print(uu) - print(response.headers) - print(name) - del response - if 'size' in value['inlineObjectProperties']['embeddedObject']: - img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) - img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) - - -def get_doc_generic(docid, bracket=1, verbose=0): - import pickle - import os.path - from googleapiclient.discovery import build - from google_auth_oauthlib.flow import InstalledAppFlow - from google.auth.transport.requests import Request - global img_count, img_lookup, img_heights, img_widths - -# If modifying these scopes, delete the file token.pickle. - SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] - creds = None - # The file token.pickle stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. - if os.path.exists('token.pickle'): - with open('token.pickle', 'rb') as token: - creds = pickle.load(token) - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - 'credentials.json', SCOPES) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open('token.pickle', 'wb') as token: - pickle.dump(creds, token) - - service = build('docs', 'v1', credentials=creds) - - # Retrieve the documents contents from the Docs service. 
- document = service.documents().get(documentId=docid).execute() - - tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') - tempout.write( json.dumps(document,indent=2) \ - + "\n\n\n------------------------------------\n\n") - if verbose: print('The title of the document is: {}'.format(document.get('title'))) - - doc_content = document.get('body').get('content') - doc_objects = document.get('inlineObjects') - doc_lists = document.get('lists') - - #text = '' - result = [] - last_type = '' - #answer_text = '' - answer = [] - in_a_list = '' - - # Get all the images - for k,value in doc_objects.items(): - tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") - fetched = fetch_doc_image(k,value) - - list_stack = [] - list_depth = 0 - last_list_depth = 0 - for value in doc_content: - tempout.write( json.dumps(value,indent=2) + "\n\n\n") - if verbose: print(json.dumps(value, sort_keys=True, indent=4)) - - tag_fxn = handle_para - if 'paragraph' in value: - this_text = '' - - # First we deal with if we're in a list. - if 'bullet' in value['paragraph']: - # either we're (1)starting a new list, (2)in one (do nothing), - # (3)starting a nested one, or (4)finished a nested one. - lid = value['paragraph']['bullet']['listId'] - if not list_stack: # 1 - list_stack.append(lid) - else: - if not lid == list_stack[0]: - if not lid in list_stack: # 3 - list_stack.append(lid) - else: # 4 - x = list_stack.pop() - while x != lid: list_stack.pop() - elif len(list_stack) > 0: - # current para isn't a bullet but we still have a list open. - list_stack = [] - - - list_depth = len(list_stack) - deeper = list_depth - last_list_depth - if deeper > 0: - answer.append("" * deeper) - if len(list_stack): - tag_fxn = handle_li - - # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next, - elements = value.get('paragraph').get('elements') - if 'paragraphStyle' in value.get('paragraph'): - style = value.get('paragraph').get('paragraphStyle') - if 'namedStyleType' in style: - type = style['namedStyleType'] - - # and FINALLY, the actual contents. - for elem in elements: - # text content - this_text += read_paragraph_element_2(elem,type) - - # image content - if 'inlineObjectElement' in elem: - vpi = elem['inlineObjectElement'] - if 'inlineObjectId' in vpi: - ii = vpi['inlineObjectId'] - if ii in img_lookup: - img = img_lookup[ii] - h = img_heights[ii] - w = img_widths[ii] - this_text += '' % (img,w,h) - - - # Now for something tricky. Call an appropriate handler, based on: - # (a) what is the paragraph style type? - # (b) is it different from the prev one? 
- - if last_type=='NORMAL_TEXT' and type!=last_type: - if this_text.strip(): - result.append(handle_answer(answer)) - answer = [] - #answer_text = '' - - if type=='HEADING_2' and this_text.strip(): - result.append( handle_sec(this_text) ) - this_text = '' - elif type=='HEADING_3' and this_text.strip(): - result.append(handle_question(this_text,bracket)) - this_text = '' - else: - if this_text.lower().startswith('tags:'): - tag_fxn = handle_tags - if this_text.lower().startswith('icons:'): - tag_fxn = handle_icons - if this_text.strip(): - answer.append(tag_fxn(this_text)) - this_text = '' - last_type = type - last_list_depth = list_depth - - elif 'table' in value: - pass - - - result.append(handle_answer(answer)) - return json.dumps(result,indent=4) - - - - -def process_reg_history(term='fa25'): - from collections import defaultdict - from itertools import groupby - from operator import itemgetter - - def read_grouped_csv(path): - with open(path, newline='') as f: - fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] - reader = csv.DictReader(f, fieldnames=fieldnames) - rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp - grouped = {} - for ts, group in groupby(rows, key=itemgetter('datetime')): - grouped[ts] = {r['crn']: r for r in group} - return grouped - - def crossed_threshold(old_val, new_val, max_val): - thresholds = [0.25, 0.5, 0.75, 1.0] - if int(max_val) == 0: - return False, None - old_ratio = int(old_val) / int(max_val) - new_ratio = int(new_val) / int(max_val) - for t in thresholds: - if old_ratio < t <= new_ratio: - return True, int(t * 100) - return False, None - - def detect_changes(prev, curr): - changes = defaultdict(list) - - all_crns = prev.keys() | curr.keys() - for crn in all_crns: - o, n = prev.get(crn), curr.get(crn) - if not o: - changes[crn].append((n['datetime'], "Section was added.")) - elif not n: - changes[crn].append(( - o['datetime'], - f"Section was removed (last seen: teacher {o['teacher']}, " - f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)." 
- )) - else: - dt = n['datetime'] - if o['teacher'] != n['teacher']: - changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}.")) - if o['enrolled'] != n['enrolled']: - crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) - if crossed: - changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).")) - if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']: - changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}.")) - return changes - - def time_to_iso(s): - return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat() - - def detect_changes_structured(prev, curr): - changes = defaultdict(list) - - all_crns = prev.keys() | curr.keys() - for crn in all_crns: - o, n = prev.get(crn), curr.get(crn) - if not o: - changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."}) - elif not n: - changes[crn].append( - {'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.", - 'value': o['enrolled'], 'capacity': o['max'], }) - else: - dt = time_to_iso(n['datetime']) - if o['teacher'] != n['teacher']: - changes[crn].append({'time':dt, "type":'teacher_change', - 'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.", - 'old_teacher': o['teacher'], 'new_teacher': n['teacher'], }) - if o['enrolled'] != n['enrolled']: - crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) - if crossed: - changes[crn].append({'time':dt, "type":'enrollment_milestone', - 'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).", - 'percent':percent,'value':n['enrolled'],'capacity':n['max'] }) - if int(n['waitlisted']) > 10 and o['waitlisted'] < n['waitlisted']: - changes[crn].append({'time':dt, "type":'enrollment_milestone', - 'message': f"Waitlist exceeds 10: {n['waitlisted']}).", - 'value':n['waitlisted']}) - return changes - - - def process_diff_timeline(path): - snapshots = read_grouped_csv(path) - timeline = sorted(snapshots.keys()) - timeline_diffs = [] - timeline_diffs_structured = [] - course_names = {} # crn -> latest known course name - - for i in range(1, len(timeline)): - prev_ts, curr_ts = timeline[i-1], timeline[i] - prev, curr = snapshots[prev_ts], snapshots[curr_ts] - - # update course name map - for crn, row in curr.items(): - course_names[crn] = row['course'] - - delta = detect_changes(prev, curr) - timeline_diffs.append(delta) - - delta_structured = detect_changes_structured(prev,curr) - timeline_diffs_structured.append(delta_structured) - - # Flatten and group by crn - crn_changes = defaultdict(list) - for delta in timeline_diffs: - for crn, changes in delta.items(): - crn_changes[crn].extend(changes) - - # Flatten and group by crn - crn_changes_structured = defaultdict(list) - for delta in timeline_diffs_structured: - for crn, changes in delta.items(): - crn_changes_structured[crn].extend(changes) - - # Sort changes for each CRN by datetime - for crn in crn_changes: - crn_changes[crn].sort(key=lambda x: x[0]) - - # Sort changes for each CRN by datetime - for crn in crn_changes_structured: - crn_changes[crn].sort(key=lambda x: x[0]) - - return crn_changes, crn_changes_structured, course_names - - fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text - fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8') - fresh_file.write(fresh_history) - fresh_file.close() - - output1 = 
codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8') - output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8') - changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv") - - # once for plain text - - for crn in sorted(changes, key=lambda c: course_names.get(c, "")): - course = course_names.get(crn, "") - course_output = {'code': course, 'crn':crn,'events':[]} - print(f"\n{course} (CRN {crn}):") - output1.write(f"\n{course} (CRN {crn}):\n") - for dt, msg in changes[crn]: - print(f" [{dt}] {msg}") - output1.write(f" [{dt}] {msg}\n") - - course_output['events'].append({'message':msg, 'time':time_to_iso(dt)}) - - # again for structured - crn_list = [] - - for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")): - course = course_names.get(crn, "") - course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]} - crn_list.append(course_output) - - output2.write( json.dumps(crn_list,indent=2) ) - output2.close() - - - -def recreate_all(): - # Use canonical semester short codes from semesters.py - try: - from semesters import code as SEM_CODES - except Exception: - SEM_CODES = [] - for x in SEM_CODES: - try: - recreate_reg_data(x) - except Exception as e: - print(f'Failed on {x} with: {e}') - - -def recreate_reg_data(term="fa25"): - from collections import defaultdict - from datetime import datetime - - def parse_row(row): - dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M") - crn = row['crn'] - enrolled = int(row['enrolled']) - return dt, row['datetime'], crn, enrolled - - def reduce_latest_per_day(rows): - latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled) - latest_ts_by_date = {} # date → (dt, ts) for header naming - - for row in rows: - dt, full_ts, crn, enrolled = parse_row(row) - date_str = dt.date().isoformat() - ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want - - # for each crn, per day, keep latest reading - if date_str not in latest[crn] or dt > latest[crn][date_str][0]: - latest[crn][date_str] = (dt, ts_header, enrolled) - - # also record latest timestamp per day for consistent column headers - if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]: - latest_ts_by_date[date_str] = (dt, ts_header) - - return latest, [ts for _, ts in sorted(latest_ts_by_date.values())] - - def pivot_table(latest, headers): - crns = sorted(latest) - table = [] - - for crn in crns: - row = [crn] - for ts in headers: - date_str = ts[:10] # match on YYYY-MM-DD - val = latest[crn].get(date_str) - if val and val[1] == ts: - row.append(str(val[2])) - else: - row.append("") - table.append(row) - - return ['crn'] + headers, table - - #with open(f"cache/reg_history_{term}.csv", newline='') as f: - from io import StringIO - url = f"https://gavilan.cc/schedule/reg_history_{term}.csv" - - # Download - resp = requests.get(url) - resp.raise_for_status() # raises if bad status - - # Wrap the text in a file-like object - f = StringIO(resp.text) - - fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] - reader = csv.DictReader(f, fieldnames=fieldnames) - rows = list(reader) - - latest, headers = reduce_latest_per_day(rows) - header_row, table = pivot_table(latest, headers) - - with open(f"cache/reg_data_{term}.csv", "w", newline='') as f: - writer = csv.writer(f) - writer.writerow(header_row) - writer.writerows(table) - - -if __name__ == "__main__": - - print ('') - options = { 1: ['Fetch rosters on 
schedule',fetch_current_rosters_auto] , - 2: ['Get canvas data 2024 style', canvas_data_2024_run ], - 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run], - 4: ['Narrative timeline of section updates', process_reg_history], - 5: ['Create narrative format all semesters', recreate_all], - 6: ['Recreate reg_data from full reg history', recreate_reg_data], - } - - '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] , - 2: ['Fetch rosters',fetch_current_rosters] , - 3: - 4: ['Compute how registration is filling up classes', schedule_filling] , - 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] , - 6: ['Canvas data: interactive sync', interactive ], - 7: ['Canvas data: automated sync', sync_non_interactive ], - 8: - 9: - 16: ['Scrape schedule from ssb', scrape_schedule_multi ], - 14: ['Generate latestart schedule', list_latestarts ], - 15: ['Test ssb calls with python', scrape_schedule_py ], - 10: ['schedule to db', scrape_for_db ], - 11: ['clean argos draft schedule file', argos_data_from_cvc], - 12: ['make expanded schedule json files of old semesters', expand_old_semesters ], - 13: ['Parse deanza schedule', dza_sched ], - ''' - - - if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): - resp = int(sys.argv[1]) - print("\n\nPerforming: %s\n\n" % options[resp][0]) - - else: - print ('') - for key in options: - print(str(key) + '.\t' + options[key][0]) - - print('') - resp = input('Choose: ') - - # Call the function in the options dict - options[ int(resp)][1]() - -# Testing - -#if __name__ == "__main__": - #users = fetch('/api/v1/courses/69/users?per_page=100',1) - #print "These are the users: " - #print users - - #getSemesterSchedule() - - - #get_doc() - #pass +import util +import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path +import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib +from datetime import timedelta + +from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media +from canvas_secrets import instructure_url, instructure_username, instructure_private_key + +import os, asyncio +from dap.api import DAPClient +from dap.dap_types import Credentials +from dap.integration.database import DatabaseConnection +from dap.replicator.sql import SQLReplicator + + + +""" +Everything to do with fetching data, + - From iLearn, via token + - current roster uploads from instructures sftp site + - raw logs and other from canvas data repo + - from ssb, use firefox to scrape the schedule + + +And some subsequent processing: + - Raw roster files, into a more compact json format + - Raw logs into something more useful +""" + +verbose = False + +users = {} +users_by_id = {} + +# todo: all these constants for SSB -- line 1008 +# +# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python + + + + + + +sys.setrecursionlimit( 100000 ) + +local_data_folder = 'cache/canvas_data/' +mylog = codecs.open(local_data_folder + 'temp_log.txt','w') + + + + +class FetchError(Exception): + pass + + +DEBUG = 0 + +def d(s,end=''): + global DEBUG + if end and DEBUG: print(s,end=end) + elif DEBUG: print(s) + +################ +################ CANVAS API MAIN FETCHING FUNCTIONS +################ +################ +################ + + + + +# Main canvas querying fxn +def fetch(target,verbose=0,params=0,media=0): + # if there are more results, recursivly call myself, adding on to the results. 
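+    # Illustration (Link-header format per the Canvas REST docs; URLs are made-up examples):
+    # pagination arrives in the HTTP Link response header, roughly
+    #   <https://<domain>/api/v1/courses?page=2&per_page=10>; rel="next",
+    #   <https://<domain>/api/v1/courses?page=1&per_page=10>; rel="first", ...
+    # The code below splits that header on commas, strips the angle brackets, and
+    # recurses on the entry tagged rel="next", merging each page into `results`
+    # (dict.update for dict payloads, list.extend for list payloads).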
+ results = 0 + if target[0:4] != "http": target = url + target + if verbose: + print("++ Fetching: " + target) + if media: + r2 = requests.get(target, headers = header_media) + elif params: + r2 = requests.get(target, headers = header, params = params) + else: + r2 = requests.get(target, headers = header) + #if verbose: + #print "++ Got: " + r2.text + try: + results = json.loads(r2.text) + count = len(results) + except: + print("-- Failed to parse: ", r2.text) + if verbose: + print("Got %i results" % count) + if verbose > 1: + print(r2.headers) + + tempout = codecs.open('cache/fetchcache.txt','a','utf-8') + tempout.write(r2.text+"\n\n") + tempout.close() + + if ('link' in r2.headers and count > 0): + links = r2.headers['link'].split(',') + for L in links: + ll = L.split(';') + link = ll[0].replace("<","") + link = link.replace(">","") + if re.search(r'next', ll[1]): + if (verbose): print("++ More link: " + link) + #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500') + #if (verbose): print("++ More link: " + link) + + nest = fetch(link,verbose,params,media) + if isinstance(results,dict): results.update(nest) + else: results.extend(nest) + return results + +# Main canvas querying fxn - stream version - don't die on big requests +def fetch_stream(target,verbose=0): + # if there are more results, recursivly call myself, adding on to the results. + results = 0 + while target: + if target[0:4] != "http": target = url + target + if verbose: + print("++ Fetching: " + target) + r2 = requests.get(target, headers = header) + if r2.status_code == 502: + raise FetchError() + try: + results = json.loads(r2.text) + count = len(results) + except: + print("-- Failed to parse: ", r2.text) + if verbose: + print("Got %i results" % count) + if verbose > 1: + print(r2.headers) + tempout = codecs.open('cache/fetchcache.txt','a','utf-8') + tempout.write(r2.text+"\n\n") + tempout.close() + + next_link_found = 0 + if ('link' in r2.headers and count > 0): + links = r2.headers['link'].split(',') + for L in links: + ll = L.split(';') + link = ll[0].replace("<","") + link = link.replace(">","") + if re.search(r'next', ll[1]): + target = link + next_link_found = 1 + break + if not next_link_found: target = 0 + yield results + + +# for dicts with one key, collapse that one key out, cause +# paging makes problems... example: enrollment_terms +def fetch_collapse(target,collapse='',verbose=0): + # if there are more results, recursivly call myself, adding on to the results. 
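+    # Illustration (the endpoint named in the comment above): the enrollment-terms API
+    # returns a wrapper object like {"enrollment_terms": [ {...}, {...} ]} rather than a
+    # bare list, so naively merging pages with dict.update() would overwrite the previous
+    # page's list. Passing collapse="enrollment_terms" unwraps each page to the inner
+    # list first, and the pages are then concatenated with list.extend().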
+ results = 0 + if target[0:4] != "http": target = url + target + if verbose: + print("++ Fetching: " + target) + r2 = requests.get(target, headers = header) + #if verbose: + #print "++ Got: " + r2.text + try: + results = json.loads(r2.text) + except: + print("-- Failed to parse: ", r2.text) + if verbose: print(r2.headers) + + if collapse and collapse in results: + results = results[collapse] + + if ('link' in r2.headers): + links = r2.headers['link'].split(',') + for L in links: + ll = L.split(';') + link = ll[0].replace("<","") + link = link.replace(">","") + if re.search(r'next', ll[1]): + if (verbose): print("++ More link: " + link) + nest = fetch_collapse(link, collapse, verbose) + if isinstance(results,dict): results.update(nest) + else: results.extend(nest) + return results + + + + + + + + + +################ +################ CANVAS DATA +################ +################ +################ + + +# Get canvas data 2024 style +def canvas_data_2024_run(): + print("Updating all tables.") + asyncio.run(canvas_data_2024()) + print("Done with all tables.") + + +async def canvas_data_2024(): + + base_url: str = os.environ["DAP_API_URL"] + client_id: str = os.environ["DAP_CLIENT_ID"] + client_secret: str = os.environ["DAP_CLIENT_SECRET"] + #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" + + # todo: use secrets + connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db" + + desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') + credentials = Credentials.create(client_id=client_id, client_secret=client_secret) + + async with DatabaseConnection(connection_string).open() as db_connection: + async with DAPClient(base_url, credentials) as session: + #tables = await session.get_tables("canvas") + for table in desired_tables: + print(f" trying to update {table} ") + try: + #await SQLReplicator(session, db_connection).initialize("canvas", table) + await SQLReplicator(session, db_connection).synchronize("canvas", table) + except Exception as e: + print(f" - skipping {table} because {e}") + + + +# Get canvas data 2024 style +def setup_canvas_data_2024_run(): + print("Setting up all tables.") + asyncio.run(setup_canvas_data_2024()) + print("Done with all tables.") + + +async def setup_canvas_data_2024(): + + base_url: str = os.environ["DAP_API_URL"] + client_id: str = os.environ["DAP_CLIENT_ID"] + client_secret: str = os.environ["DAP_CLIENT_SECRET"] + #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" + connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db" + + desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') + credentials = Credentials.create(client_id=client_id, client_secret=client_secret) + + async with DatabaseConnection(connection_string).open() as db_connection: + async with 
DAPClient(base_url, credentials) as session: + #tables = await session.get_tables("canvas") + for table in desired_tables: + print(f" {table}") + try: + await SQLReplicator(session, db_connection).initialize("canvas", table) + except Exception as e: + print(f" - skipping {table} because {e}") + + + + + + + + + + + + + + + + + + + + + + +################ +################ ROSTERS AND REGISTRATION +################ +################ +################ + +# todo: the pipeline is disorganized. Organize it to have +# a hope of taking all this to a higher level. +# + +# todo: where does this belong in the pipeline? compare with recent_schedules() + + + +# Take the generically named rosters uploads files and move them to a semester folder and give them a date. +def move_to_folder(sem,year,folder,files): + semester = year+sem + semester_path = 'cache/rosters/%s' % semester + if not os.path.isdir('cache/rosters/'+semester): + os.makedirs('cache/rosters/'+semester) + now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') + print("+ Moving roster files to folder: %s" % semester_path) + if not os.path.isdir(semester_path): + print("+ Creating folder: %s" % semester_path) + os.makedirs(semester_path) + def safe_move(src, dst): + try: + os.replace(src, dst) + except Exception as ex: + print(f"! move failed {src} -> {dst}: {ex}") + if 'courses.csv' in files: + safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now)) + if 'enrollments.csv' in files: + safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now)) + if 'users.csv' in files: + safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now)) + + + +# Take raw upload (csv) files and make one big json out of them. +# This relates to enrollment files, not schedule. +def convert_roster_files(semester="",year="",folder=""): + if not semester: + semester = input("the semester? (ex: spring) ") + folder = input("Folder? (ex 2020-02-25-14-58-20) ") + uf = open('cache/rosters/users-'+folder+'.csv','r') + cf = open('cache/rosters/courses-'+folder+'.csv','r') + ef = open('cache/rosters/enrollments-'+folder+'.csv','r') + u = csv.DictReader(uf) + c = csv.DictReader(cf) + e = csv.DictReader(ef) + uu = [i for i in u] + cc = [i for i in c] + ee = [i for i in e] + uf.close() + cf.close() + ef.close() + myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester) + + if os.path.exists(myrosterfile): + print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile) + last_fileobj = open(myrosterfile,'r') + last_file = json.load(last_fileobj) + + last_fileobj.close() + + info = last_file[3] + last_date = info['date_filestring'] + + print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date)) + + try: + os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' 
% (year,semester,last_date)) + print(' -- ok') + except Exception as e: + print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) + print(e) + myrosterfile = "new_" + myrosterfile + pass + #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json') + #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json') + + newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), } + try: + new_roster = codecs.open(myrosterfile,'w', 'utf-8') + new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 )) + new_roster.close() + print(" -- Wrote roster info to: %s." % myrosterfile) + except Exception as e: + print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) + print(" ** " + str(e)) + + + + + + +# From instructure sftp site +def fetch_current_rosters(sftp=None, label_hour=None): + """Download roster CSVs from Instructure SFTP and post-process. + - If sftp provided, reuse it (already chdir'd to SIS). + - Files are saved using label_hour (format YYYY-MM-DD-HH). If None, compute fallback label by flooring to the hour. + """ + import pysftp + def log(msg): + ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + try: + with open('cache/pipeline.log.txt','a', encoding='utf-8') as f: + f.write(f"[{ts}] {msg}\n") + except Exception: + print(msg) + + close_after = False + if sftp is None: + cnopts = pysftp.CnOpts() + cnopts.hostkeys = None + sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts) + sftp.chdir('SIS') + close_after = True + try: + now = datetime.datetime.now() + if not label_hour: + # default fallback: floor to hour + label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H') + + files = set(sftp.listdir()) + log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}") + need = ['login','users','courses','enrollments'] + saved = [] + for name in need: + remote = f'{name}.csv' + target = f'cache/rosters/{name}-{label_hour}.csv' + try: + if remote in files: + if not (os.path.exists(target) and os.path.getsize(target) > 0): + temp = target + '.part' + try: + if os.path.exists(temp): + os.remove(temp) + except Exception: + pass + sftp.get(remote, temp) + # atomic replace + os.replace(temp, target) + saved.append(remote) + log(f' saved {remote} -> {target}') + else: + log(f' already exists: {target}, skipping') + else: + log(f' missing on server: {remote}') + except Exception as ex: + log(f' download failed: {remote} -> {target} err={ex}') + + if len(saved) >= 3 and 'courses.csv' in saved or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'): + try: + with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses: + courses.readline() + a = courses.readline() + parts = a.split(',') + year = parts[1][0:4] + ss = parts[1][4:6] + sem = {'30':'spring', '50':'summer', '70':'fall' } + this_sem = sem.get(ss, 'spring') + log(f"post-process for semester={this_sem} year={year} label={label_hour}") + convert_roster_files(this_sem,year,label_hour) + move_to_folder(this_sem,year,label_hour,saved) + except Exception as expp: + log(f'post-processing failed: {expp}') + else: + log('not enough files to post-process yet') + finally: + if close_after: + try: + sftp.close() + except Exception: + pass + +def fetch_current_rosters_auto(poll_seconds=15): + """Poll Instructure SFTP from the top of each hour until all 4 roster files appear, then download once. 
+ - Robust logging to cache/pipeline.log.txt + - Stops polling for the remainder of that hour after success + - Tries again on the next hour + """ + import pysftp + from contextlib import contextmanager + + log_path = 'cache/pipeline.log.txt' + + def log(msg): + ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + try: + with open(log_path, 'a', encoding='utf-8') as f: + f.write(f"[{ts}] {msg}\n") + except Exception: + print(f"[log-fail] {msg}") + + @contextmanager + def sftp_conn(): + cnopts = pysftp.CnOpts() + cnopts.hostkeys = None + conn = None + try: + conn = pysftp.Connection(instructure_url, username=instructure_username, + private_key=instructure_private_key, cnopts=cnopts) + conn.chdir('SIS') + yield conn + finally: + try: + if conn: + conn.close() + except Exception: + pass + + required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'} + current_window = None # e.g., '2025-08-30-14' + window_done = False + window_end = None + + def compute_window(now): + """Return (label_hour_str, window_end_dt) if within a window, else (None, None). + Window spans [H-5m, H+5m] around each top-of-hour H. + If minute >= 55 -> window is next hour + If minute <= 5 -> window is current hour + Else -> not in window. + """ + m = now.minute + if m >= 55: + base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)) + elif m <= 5: + base = now.replace(minute=0, second=0, microsecond=0) + else: + return (None, None) + label = base.strftime('%Y-%m-%d-%H') + end = base + timedelta(minutes=5) # end of window + return (label, end) + + log('fetch_current_rosters_auto: starting poll loop') + while True: + now = datetime.datetime.now() + hour_key = now.strftime('%Y-%m-%d %H') + label, enddt = compute_window(now) + if label is None: + time.sleep(max(5, int(poll_seconds))) + continue + + if label != current_window: + current_window = label + window_done = False + window_end = enddt + log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.') + + if window_done: + time.sleep(5) + continue + + # Poll SFTP for files + try: + with sftp_conn() as sftp: + files = set(sftp.listdir()) + missing = list(required - files) + if missing: + log(f'Not ready yet. Missing: {missing}') + else: + log('All required files present inside window. Starting download...') + try: + fetch_current_rosters(sftp=sftp, label_hour=current_window) + log('Download complete. Marking window_done for this window.') + window_done = True + except Exception as ex_dl: + import traceback + log('Download failed: ' + str(ex_dl)) + log(traceback.format_exc()) + except Exception as ex_list: + import traceback + log('SFTP list failed: ' + str(ex_list)) + log(traceback.format_exc()) + + # Check for window timeout + try: + if not window_done and window_end and now >= window_end: + log(f'REPORT: window {current_window} ended without all files. 
Consider alerting/email.') + window_done = True # stop further checks this window + except Exception: + pass + + time.sleep(max(5, int(poll_seconds))) + + + + + + +################ +################ SENDING DATA AWAY +################ +################ +################ + +# Upload a json file to www +def put_file(remotepath,localpath, localfile,prompt=1): + import pysftp + show_all = 0 + folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + cnopts = pysftp.CnOpts() + cnopts.hostkeys = None + + with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: + #todo: these paths + #files = sftp.listdir() + #print(folder + "\tI see these files on remote: ", files, "\n") + sftp.chdir(remotepath) + files = sftp.listdir() + if show_all: print(folder + "\tI see these files on remote: ", files, "\n") + localf = os.listdir(localpath) + if show_all: print("I see these local: ", localf) + if prompt: + input('ready to upload') + sftp.put(localpath+localfile, localfile, preserve_mtime=True) + sftp.close() + + +#text = + result = [] + last_type = '' + #answer_text = '' + answer = [] + in_a_list = '' + + # Get all the images + for k,value in doc_objects.items(): + tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") + fetched = fetch_doc_image(k,value) + + list_stack = [] + list_depth = 0 + last_list_depth = 0 + for value in doc_content: + tempout.write( json.dumps(value,indent=2) + "\n\n\n") + if verbose: print(json.dumps(value, sort_keys=True, indent=4)) + + tag_fxn = handle_para + if 'paragraph' in value: + this_text = '' + + # First we deal with if we're in a list. + if 'bullet' in value['paragraph']: + # either we're (1)starting a new list, (2)in one (do nothing), + # (3)starting a nested one, or (4)finished a nested one. + lid = value['paragraph']['bullet']['listId'] + if not list_stack: # 1 + list_stack.append(lid) + else: + if not lid == list_stack[0]: + if not lid in list_stack: # 3 + list_stack.append(lid) + else: # 4 + x = list_stack.pop() + while x != lid: list_stack.pop() + elif len(list_stack) > 0: + # current para isn't a bullet but we still have a list open. + list_stack = [] + + + list_depth = len(list_stack) + deeper = list_depth - last_list_depth + if deeper > 0: + answer.append("" * deeper) + if len(list_stack): + tag_fxn = handle_li + + # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next, + elements = value.get('paragraph').get('elements') + if 'paragraphStyle' in value.get('paragraph'): + style = value.get('paragraph').get('paragraphStyle') + if 'namedStyleType' in style: + type = style['namedStyleType'] + + # and FINALLY, the actual contents. + for elem in elements: + # text content + this_text += read_paragraph_element_2(elem,type) + + # image content + if 'inlineObjectElement' in elem: + vpi = elem['inlineObjectElement'] + if 'inlineObjectId' in vpi: + ii = vpi['inlineObjectId'] + if ii in img_lookup: + img = img_lookup[ii] + h = img_heights[ii] + w = img_widths[ii] + this_text += '' % (img,w,h) + + + # Now for something tricky. Call an appropriate handler, based on: + # (a) what is the paragraph style type? + # (b) is it different from the prev one? 
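+            # Illustrative summary of the dispatch below:
+            #   HEADING_2   -> handle_sec      (starts a new section)
+            #   HEADING_3   -> handle_question (new question; optional [anchor] in brackets)
+            #   NORMAL_TEXT -> accumulated into `answer` via handle_para / handle_li,
+            #                  or handle_tags / handle_icons when the text starts with
+            #                  "Tags:" / "Icons:".
+            # A transition out of NORMAL_TEXT flushes the accumulated answer first, so
+            # each heading ends up owning the body paragraphs that followed it.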
+ + if last_type=='NORMAL_TEXT' and type!=last_type: + if this_text.strip(): + result.append(handle_answer(answer)) + answer = [] + #answer_text = '' + + if type=='HEADING_2' and this_text.strip(): + result.append( handle_sec(this_text) ) + this_text = '' + elif type=='HEADING_3' and this_text.strip(): + result.append(handle_question(this_text,bracket)) + this_text = '' + else: + if this_text.lower().startswith('tags:'): + tag_fxn = handle_tags + if this_text.lower().startswith('icons:'): + tag_fxn = handle_icons + if this_text.strip(): + answer.append(tag_fxn(this_text)) + this_text = '' + last_type = type + last_list_depth = list_depth + + elif 'table' in value: + pass + + + result.append(handle_answer(answer)) + return json.dumps(result,indent=4) + + + + +def process_reg_history(term='fa25'): + from collections import defaultdict + from itertools import groupby + from operator import itemgetter + + def read_grouped_csv(path): + with open(path, newline='') as f: + fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] + reader = csv.DictReader(f, fieldnames=fieldnames) + rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp + grouped = {} + for ts, group in groupby(rows, key=itemgetter('datetime')): + grouped[ts] = {r['crn']: r for r in group} + return grouped + + def crossed_threshold(old_val, new_val, max_val): + thresholds = [0.25, 0.5, 0.75, 1.0] + if int(max_val) == 0: + return False, None + old_ratio = int(old_val) / int(max_val) + new_ratio = int(new_val) / int(max_val) + for t in thresholds: + if old_ratio < t <= new_ratio: + return True, int(t * 100) + return False, None + + def detect_changes(prev, curr): + changes = defaultdict(list) + + all_crns = prev.keys() | curr.keys() + for crn in all_crns: + o, n = prev.get(crn), curr.get(crn) + if not o: + changes[crn].append((n['datetime'], "Section was added.")) + elif not n: + changes[crn].append(( + o['datetime'], + f"Section was removed (last seen: teacher {o['teacher']}, " + f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)." 
+ )) + else: + dt = n['datetime'] + if o['teacher'] != n['teacher']: + changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}.")) + if o['enrolled'] != n['enrolled']: + crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) + if crossed: + changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).")) + if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']: + changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}.")) + return changes + + def time_to_iso(s): + return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat() + + def detect_changes_structured(prev, curr): + changes = defaultdict(list) + + all_crns = prev.keys() | curr.keys() + for crn in all_crns: + o, n = prev.get(crn), curr.get(crn) + if not o: + changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."}) + elif not n: + changes[crn].append( + {'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.", + 'value': o['enrolled'], 'capacity': o['max'], }) + else: + dt = time_to_iso(n['datetime']) + if o['teacher'] != n['teacher']: + changes[crn].append({'time':dt, "type":'teacher_change', + 'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.", + 'old_teacher': o['teacher'], 'new_teacher': n['teacher'], }) + if o['enrolled'] != n['enrolled']: + crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) + if crossed: + changes[crn].append({'time':dt, "type":'enrollment_milestone', + 'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).", + 'percent':percent,'value':n['enrolled'],'capacity':n['max'] }) + if int(n['waitlisted']) > 10 and o['waitlisted'] < n['waitlisted']: + changes[crn].append({'time':dt, "type":'enrollment_milestone', + 'message': f"Waitlist exceeds 10: {n['waitlisted']}).", + 'value':n['waitlisted']}) + return changes + + + def process_diff_timeline(path): + snapshots = read_grouped_csv(path) + timeline = sorted(snapshots.keys()) + timeline_diffs = [] + timeline_diffs_structured = [] + course_names = {} # crn -> latest known course name + + for i in range(1, len(timeline)): + prev_ts, curr_ts = timeline[i-1], timeline[i] + prev, curr = snapshots[prev_ts], snapshots[curr_ts] + + # update course name map + for crn, row in curr.items(): + course_names[crn] = row['course'] + + delta = detect_changes(prev, curr) + timeline_diffs.append(delta) + + delta_structured = detect_changes_structured(prev,curr) + timeline_diffs_structured.append(delta_structured) + + # Flatten and group by crn + crn_changes = defaultdict(list) + for delta in timeline_diffs: + for crn, changes in delta.items(): + crn_changes[crn].extend(changes) + + # Flatten and group by crn + crn_changes_structured = defaultdict(list) + for delta in timeline_diffs_structured: + for crn, changes in delta.items(): + crn_changes_structured[crn].extend(changes) + + # Sort changes for each CRN by datetime + for crn in crn_changes: + crn_changes[crn].sort(key=lambda x: x[0]) + + # Sort changes for each CRN by datetime + for crn in crn_changes_structured: + crn_changes[crn].sort(key=lambda x: x[0]) + + return crn_changes, crn_changes_structured, course_names + + fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text + fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8') + fresh_file.write(fresh_history) + fresh_file.close() + + output1 = 
codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8') + output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8') + changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv") + + # once for plain text + + for crn in sorted(changes, key=lambda c: course_names.get(c, "")): + course = course_names.get(crn, "") + course_output = {'code': course, 'crn':crn,'events':[]} + print(f"\n{course} (CRN {crn}):") + output1.write(f"\n{course} (CRN {crn}):\n") + for dt, msg in changes[crn]: + print(f" [{dt}] {msg}") + output1.write(f" [{dt}] {msg}\n") + + course_output['events'].append({'message':msg, 'time':time_to_iso(dt)}) + + # again for structured + crn_list = [] + + for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")): + course = course_names.get(crn, "") + course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]} + crn_list.append(course_output) + + output2.write( json.dumps(crn_list,indent=2) ) + output2.close() + + + +def recreate_all(): + # Use canonical semester short codes from semesters.py + try: + from semesters import code as SEM_CODES + except Exception: + SEM_CODES = [] + for x in SEM_CODES: + try: + recreate_reg_data(x) + except Exception as e: + print(f'Failed on {x} with: {e}') + + +def recreate_reg_data(term="fa25"): + from collections import defaultdict + from datetime import datetime + + def parse_row(row): + dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M") + crn = row['crn'] + enrolled = int(row['enrolled']) + return dt, row['datetime'], crn, enrolled + + def reduce_latest_per_day(rows): + latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled) + latest_ts_by_date = {} # date → (dt, ts) for header naming + + for row in rows: + dt, full_ts, crn, enrolled = parse_row(row) + date_str = dt.date().isoformat() + ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want + + # for each crn, per day, keep latest reading + if date_str not in latest[crn] or dt > latest[crn][date_str][0]: + latest[crn][date_str] = (dt, ts_header, enrolled) + + # also record latest timestamp per day for consistent column headers + if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]: + latest_ts_by_date[date_str] = (dt, ts_header) + + return latest, [ts for _, ts in sorted(latest_ts_by_date.values())] + + def pivot_table(latest, headers): + crns = sorted(latest) + table = [] + + for crn in crns: + row = [crn] + for ts in headers: + date_str = ts[:10] # match on YYYY-MM-DD + val = latest[crn].get(date_str) + if val and val[1] == ts: + row.append(str(val[2])) + else: + row.append("") + table.append(row) + + return ['crn'] + headers, table + + #with open(f"cache/reg_history_{term}.csv", newline='') as f: + from io import StringIO + url = f"https://gavilan.cc/schedule/reg_history_{term}.csv" + + # Download + resp = requests.get(url) + resp.raise_for_status() # raises if bad status + + # Wrap the text in a file-like object + f = StringIO(resp.text) + + fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] + reader = csv.DictReader(f, fieldnames=fieldnames) + rows = list(reader) + + latest, headers = reduce_latest_per_day(rows) + header_row, table = pivot_table(latest, headers) + + with open(f"cache/reg_data_{term}.csv", "w", newline='') as f: + writer = csv.writer(f) + writer.writerow(header_row) + writer.writerows(table) + + +if __name__ == "__main__": + + print ('') + options = { 1: ['Fetch rosters on 
schedule',fetch_current_rosters_auto] , + 2: ['Get canvas data 2024 style', canvas_data_2024_run ], + 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run], + 4: ['Narrative timeline of section updates', process_reg_history], + 5: ['Create narrative format all semesters', recreate_all], + 6: ['Recreate reg_data from full reg history', recreate_reg_data], + } + + '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] , + 2: ['Fetch rosters',fetch_current_rosters] , + 3: + 4: ['Compute how registration is filling up classes', schedule_filling] , + 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] , + 6: ['Canvas data: interactive sync', interactive ], + 7: ['Canvas data: automated sync', sync_non_interactive ], + 8: + 9: + 16: ['Scrape schedule from ssb', scrape_schedule_multi ], + 14: ['Generate latestart schedule', list_latestarts ], + 15: ['Test ssb calls with python', scrape_schedule_py ], + 10: ['schedule to db', scrape_for_db ], + 11: ['clean argos draft schedule file', argos_data_from_cvc], + 12: ['make expanded schedule json files of old semesters', expand_old_semesters ], + 13: ['Parse deanza schedule', dza_sched ], + ''' + + + if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): + resp = int(sys.argv[1]) + print("\n\nPerforming: %s\n\n" % options[resp][0]) + + else: + print ('') + for key in options: + print(str(key) + '.\t' + options[key][0]) + + print('') + resp = input('Choose: ') + + # Call the function in the options dict + options[ int(resp)][1]() + +# Testing + +#if __name__ == "__main__": + #users = fetch('/api/v1/courses/69/users?per_page=100',1) + #print "These are the users: " + #print users + + #getSemesterSchedule() + + + #get_doc() + #pass diff --git a/search.py b/search.py index 6c3c431..b9dec48 100644 --- a/search.py +++ b/search.py @@ -554,3 +554,26 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() +def try_clustering(df): + from sklearn.cluster import KMeans + df = df.drop(['code'], axis=1) + kmeans = KMeans(n_clusters=4, random_state=0).fit(df) + return kmeans.labels_ +def nlp_sample(): + from gensim import utils, corpora + from nltk import stem + stemmer = stem.porter.PorterStemmer() + strings = [ + "Human machine interface for lab abc computer applications", + "A survey of user opinion of computer system response time", + "The EPS user interface management system", + "System and human system engineering testing of EPS", + "Relation of user perceived response time to error measurement", + "The generation of random binary unordered trees", + "The intersection graph of paths in trees", + "Graph minors IV Widths of trees and well quasi ordering", + "Graph minors A survey", + ] + processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings] + dictionary = corpora.Dictionary(processed) + return dictionary diff --git a/users.py b/users.py index bba2e03..4ac8b82 100644 --- a/users.py +++ b/users.py @@ -1938,7 +1938,8 @@ def track_users_by_teacherclass(): print(json.dumps(g2, indent=2)) -def nlp_sample(): +## moved: nlp_sample now in search.py +# def nlp_sample(): # Stream a training corpus directly from S3. 
#corpus = corpora.MmCorpus("s3://path/to/corpus") @@ -1955,9 +1956,7 @@ def nlp_sample(): "Graph minors IV Widths of trees and well quasi ordering", "Graph minors A survey", ] - processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings] - print(processed) - dictionary = corpora.Dictionary( processed ) + # moved dct = dictionary print(dictionary) @@ -2980,4 +2979,3 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() -
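
The search.py hunk above leaves nlp_sample() returning the gensim Dictionary it builds. A minimal usage sketch, assuming gensim and nltk are installed and search.py is importable as shown in the hunk; the query sentence is illustrative only:

# Minimal usage sketch for the nlp_sample() helper added to search.py above.
from gensim import utils
from nltk import stem

from search import nlp_sample

# Build the toy dictionary from the hard-coded corpus inside nlp_sample().
dictionary = nlp_sample()

# Preprocess a new line of text the same way (simple_preprocess + Porter stem),
# then map it into a sparse bag-of-words vector of (token_id, count) pairs.
stemmer = stem.porter.PorterStemmer()
tokens = [stemmer.stem(t) for t in utils.simple_preprocess(
    "Measuring user response time in the EPS system", min_len=4)]
print(dictionary.doc2bow(tokens))

The same pattern would apply to try_clustering(df): it expects a numeric DataFrame with a 'code' column to drop, and returns one KMeans cluster label per row.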