From 9584f45f3076620b54594f09f4524704a0753939 Mon Sep 17 00:00:00 2001 From: Peter Howell Date: Fri, 5 Sep 2025 23:06:34 +0000 Subject: [PATCH] cleanup --- content.py | 329 ++++++- courses.py | 19 +- depricated.py | 92 ++ localcache.py | 4 +- pipelines.py | 2553 ++++++++++++++++++++----------------------------- search.py | 23 + users.py | 8 +- 7 files changed, 1472 insertions(+), 1556 deletions(-) diff --git a/content.py b/content.py index 9332113..c8b92b4 100644 --- a/content.py +++ b/content.py @@ -1528,6 +1528,334 @@ LANE: HyFlex +################ +################ GOOGLE DOCS HELPERS (moved from pipelines) +################ + +def sec(t): return "
"+t+"

\n" +def para(t): return "

"+t+"

\n" +def ul(t): return "\n" +def li(t): return "
  • "+t+"
  • \n" + +def question(t,bracket=1): + ret = '' + match = re.search( r'\[(.*)\]', t) + if match and bracket: + ret += "" + t = re.sub( r'\[.*\]','',t) + else: + parts = t.split(' ') + id = '' + for p in parts: + if re.search(r'[a-zA-Z]',p[0]): id += p[0] + ret += "" % id.lower() + return ret + '

    ' + t + '

    \n
    ' + +def answer(t): + return t + '
    \n' + +def read_paragraph_element(element,type="NORMAL_TEXT"): + text_run = element.get('textRun') + begin = '' + end = '' + if not text_run: + return '' + if 'textStyle' in text_run and 'link' in text_run['textStyle']: + begin = '' + end = '' + if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": + begin = '' + begin + end = end + '' + content = text_run.get('content') + content = re.sub(u'\u000b','
    \n',content) + return begin + content + end + +def read_paragraph_element_2(element,type="NORMAL_TEXT"): + return read_paragraph_element(element,type) + + + +# t is a string that begins with "Icons: " ... and contains comma(space) separated list +def handle_icons(t): + text = t[7:].strip() + parts = text.split(", ") + return ('icons',parts) + +# t is a string that begins with "Tags: " ... and contains comma(space) separated list +def handle_tags(t): + text = t[6:].strip() + parts = text.split(", ") + return ('tags',parts) + +def handle_question(t,bracket=1): + anchor = '' + match = re.search( r'\[(.*)\]', t) + if match and bracket: + anchor = match.group(1).lower() + t = re.sub( r'\[.*\]','',t) + else: + parts = t.split(' ') + for p in parts: + if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower() + return ('question', t, anchor) + +def handle_answer(t): + return ('answer',t) + +def handle_sec(t): return ('section',t) +def handle_para(t): return ('paragraph',t) +def handle_ul(t): return ('unorderdedlist',t) +def handle_li(t): return ('listitem',t) + + + +img_count = 1 +img_lookup = {} +img_heights = {} +img_widths = {} + + +'''def fetch_doc_image(k,value): + global img_count, img_lookup, img_heights, img_widths + if 'inlineObjectProperties' in value: + if 'embeddedObject' in value['inlineObjectProperties']: + if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: + if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: + print(k) + uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] + response = requests.get(uu, stream=True) + name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] + img_count += 1 + img_lookup[k] = name + + with open('cache/doc_images/'+name, 'wb') as out_file: + shutil.copyfileobj(response.raw, out_file) + print(uu) + print(response.headers) + print(name) + del response + if 'size' in value['inlineObjectProperties']['embeddedObject']: + img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) + img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) +''' + + +def fetch_doc_image(k,value): + import shutil + if 'inlineObjectProperties' in value: + if 'embeddedObject' in value['inlineObjectProperties']: + if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: + if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: + uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] + response = requests.get(uu, stream=True) + name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1] + with open('cache/doc_images/'+name, 'wb') as out_file: + shutil.copyfileobj(response.raw, out_file) + del response + return True + +def get_doc(docid, bracket=1, verbose=0): + import pickle, shutil + import os.path + from googleapiclient.discovery import build + from google_auth_oauthlib.flow import InstalledAppFlow + from google.auth.transport.requests import Request + + #ooout = open(fileout,'w') + + # If modifying these scopes, delete the file token.pickle. + SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] + creds = None + # The file token.pickle stores the user's access and refresh tokens, and is + # created automatically when the authorization flow completes for the first + # time. 
+ if os.path.exists('token.pickle'): + with open('token.pickle', 'rb') as token: + creds = pickle.load(token) + # If there are no (valid) credentials available, let the user log in. + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + 'credentials.json', SCOPES) + creds = flow.run_local_server(port=0) + # Save the credentials for the next run + with open('token.pickle', 'wb') as token: + pickle.dump(creds, token) + + service = build('docs', 'v1', credentials=creds) + + # Retrieve the documents contents from the Docs service. + document = service.documents().get(documentId=docid).execute() + if verbose: print(document) + + tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') + tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n") + if verbose: print('The title of the document is: {}'.format(document.get('title'))) + doc_content = document.get('body').get('content') + if verbose: print(doc_content) + + doc_objects = document.get('inlineObjects') + if verbose: print(doc_objects) + + doc_lists = document.get('lists') + + text = '
    ' + last_type = '' + answer_text = '' + in_a_list = '' + + img_count = 1 + img_lookup = {} + img_heights = {} + img_widths = {} + + if doc_objects: + for k,value in doc_objects.items(): + tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") + if 'inlineObjectProperties' in value: + if 'embeddedObject' in value['inlineObjectProperties']: + if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: + if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: + print(k) + uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] + response = requests.get(uu, stream=True) + name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] + img_count += 1 + + img_lookup[k] = name + + with open('cache/doc_images/'+name, 'wb') as out_file: + shutil.copyfileobj(response.raw, out_file) + print(uu) + print(response.headers) + print(name) + #input('x?') + del response + if 'size' in value['inlineObjectProperties']['embeddedObject']: + img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) + img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) + + tempout.write('- - - - - - - -\n\n') + #for value in doc_lists: + # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n") + + tempout.write('- - - - - - - -\n\n') + list_stack = [] + list_depth = 0 + last_list_depth = 0 + for value in doc_content: + tempout.write( json.dumps(value,indent=2) + "\n\n\n") + if verbose: print(json.dumps(value, sort_keys=True, indent=4)) + + # todo: x link, x bold, list, image. + tag_fxn = para + if 'paragraph' in value: + this_text = '' + + if 'bullet' in value['paragraph']: + # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one. + + lid = value['paragraph']['bullet']['listId'] + + if not list_stack: # 1 + list_stack.append(lid) + else: + if lid == list_stack[0]: # 2 + pass + + else: + if not lid in list_stack: # 3 + list_stack.append(lid) + else: # 4 + x = list_stack.pop() + while x != lid: list_stack.pop() + elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open. 
+ list_stack = [] + + list_depth = len(list_stack) + + deeper = list_depth - last_list_depth + + if deeper > 0: + answer_text += "" * deeper + + if len(list_stack): + tag_fxn = li + + elements = value.get('paragraph').get('elements') + + # inlineObjectElement": { + # "inlineObjectId": "kix.ssseeu8j9cfx", + + if 'paragraphStyle' in value.get('paragraph'): + style = value.get('paragraph').get('paragraphStyle') + #text += json.dumps(style, sort_keys=True, indent=4) + if 'namedStyleType' in style: + type = style['namedStyleType'] + + for elem in elements: + + # text content + this_text += read_paragraph_element(elem,type) + + # image content + if 'inlineObjectElement' in elem: + vpi = elem['inlineObjectElement'] + if 'inlineObjectId' in vpi: + ii = vpi['inlineObjectId'] + if ii in img_lookup: + img = img_lookup[ii] + h = img_heights[ii] + w = img_widths[ii] + this_text += '' % (img,w,h) + + + + if last_type=='NORMAL_TEXT' and type!=last_type: + text += answer(answer_text) + answer_text = '' + + if type=='HEADING_2': + text += sec(this_text) + this_text = '' + elif type=='HEADING_3': + text += question(this_text,bracket) + this_text = '' + else: + answer_text += tag_fxn(this_text) + this_text = '' + last_type = type + last_list_depth = list_depth + + elif 'table' in value: + # The text in table cells are in nested Structural Elements and tables may be + # nested. + text += "\nTABLE\n" + #table = value.get('table') + #for row in table.get('tableRows'): + # cells = row.get('tableCells') + # for cell in cells: + # text += read_strucutural_elements(cell.get('content')) + #elif 'tableOfContents' in value: + # # The text in the TOC is also in a Structural Element. + # toc = value.get('tableOfContents') + # text += read_strucutural_elements(toc.get('content')) + + #else: + # print(json.dumps(value, sort_keys=True, indent=4)) + + text += answer(answer_text) + #text += '
    ' + #print(text) + return text + +def get_doc_generic(docid, bracket=1, verbose=0): + return get_doc(docid, bracket, verbose) @@ -1567,4 +1895,3 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() - diff --git a/courses.py b/courses.py index 00a49e0..b33133f 100644 --- a/courses.py +++ b/courses.py @@ -2690,24 +2690,7 @@ def enrollment_helper(): # fill percentage for each section, then by mode, tod, campus -def try_clustering(df): - # Import required libraries - from sklearn.cluster import KMeans - - # Preprocessing - - # Assuming df is your DataFrame and "modes" is your categorical column - #df['code'] = df['code'].astype('category').cat.codes - - # Removing any other unnecessary columns - df = df.drop(['code'], axis=1) - - # Perform KMeans clustering - kmeans = KMeans(n_clusters=4, random_state=0).fit(df) - - # Get the cluster labels - labels = kmeans.labels_ - +## moved: try_clustering now in search.py # Add labels to the DataFrame #df['clusters'] = labels #print(df) diff --git a/depricated.py b/depricated.py index b34a5c2..1c2e604 100644 --- a/depricated.py +++ b/depricated.py @@ -4,6 +4,98 @@ # from pipelines - canvas data + +# Canvas data, download all new files +def sync_non_interactive(): + resp = do_request('/api/account/self/file/sync') + mylog.write(json.dumps(resp, indent=4)) + #mylog.close() + gotten = os.listdir(local_data_folder) + wanted = [] + i = 0 + for x in resp['files']: + filename = x['filename'] + exi = "No " + if filename in gotten: exi = "Yes" + else: wanted.append(x) + + print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename )) + i += 1 + print("I will attempt to download %i files." % len(wanted)) + + #answer = input("Press enter to begin, or q to quit ") + #if not answer == '': return + + good_count = 0 + bad_count = 0 + for W in wanted: + print("Downloading: " + W['filename']) + response = requests.request(method='GET', url=W['url'], stream=True) + if(response.status_code != 200): + print('Request response went bad. Got back a %s code, meaning the request was %s' % \ + (response.status_code, response.reason)) + print('URL: ' + W['url']) + bad_count += 1 + + else: + #Use the downloaded data + with open(local_data_folder + W['filename'], 'wb') as fd: + for chunk in response.iter_content(chunk_size=128): + fd.write(chunk) + print("Success") + good_count += 1 + print("Out of %i files, %i succeeded and %i failed." 
% (len(wanted),good_count,bad_count)) + + +## OLD STYLE CANVAS DATA + +# Get something from Canvas Data +def do_request(path): + #Set up the request pieces + method = 'GET' + host = 'api.inshosteddata.com' + apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT') + apiContentType = 'application/json' + + msgList = [] + msgList.append(method) + msgList.append(host) + msgList.append(apiContentType) + msgList.append('') + msgList.append(path) + msgList.append('') + msgList.append(apiTime) + msgList.append(apiSecret) + + msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8') + + sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest()) + sig = sig.decode('utf-8') + + headers = {} + headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig) + headers['Date'] = apiTime + headers['Content-type'] = apiContentType + + + #Submit the request/get a response + uri = "https://"+host+path + print (uri) + print (headers) + response = requests.request(method='GET', url=uri, headers=headers, stream=True) + + #Check to make sure the request was ok + if(response.status_code != 200): + print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason)) + else: + #Use the downloaded data + jsonData = response.json() + #print(json.dumps(jsonData, indent=4)) + return jsonData + + + + def file_doesnt_exist(name): # Get list of files in current directory files = os.listdir() diff --git a/localcache.py b/localcache.py index 2668751..c1f1874 100644 --- a/localcache.py +++ b/localcache.py @@ -8,7 +8,7 @@ from datetime import datetime as dt from datetime import timedelta from dateutil.parser import parse from os.path import exists, getmtime -from pipelines import sync_non_interactive, url, header +from pipelines import url, header import util from semesters import short_to_sis @@ -1121,7 +1121,7 @@ def full_reload(): except Exception as e: print("Couldn't rename file:", str(e)) - sync_non_interactive() + #sync_non_interactive() setup_table('requests_sum1') setup_table('courses') diff --git a/pipelines.py b/pipelines.py index 7e74f56..159ee41 100644 --- a/pipelines.py +++ b/pipelines.py @@ -1,1530 +1,1023 @@ -import util -import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path -import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib -from datetime import timedelta - -from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret -from canvas_secrets import instructure_url, instructure_username, instructure_private_key - -import os, asyncio -from dap.api import DAPClient -from dap.dap_types import Credentials -from dap.integration.database import DatabaseConnection -from dap.replicator.sql import SQLReplicator - - - -""" -Everything to do with fetching data, - - From iLearn, via token - - current roster uploads from instructures sftp site - - raw logs and other from canvas data repo - - from ssb, use firefox to scrape the schedule - - -And some subsequent processing: - - Raw roster files, into a more compact json format - - Raw logs into something more useful -""" - -verbose = False - -users = {} -users_by_id = {} - -# todo: all these constants for SSB -- line 1008 -# -# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python - - - - - - -sys.setrecursionlimit( 100000 ) - -local_data_folder = 'cache/canvas_data/' -mylog = 
codecs.open(local_data_folder + 'temp_log.txt','w') - - - - -class FetchError(Exception): - pass - - -DEBUG = 0 - -def d(s,end=''): - global DEBUG - if end and DEBUG: print(s,end=end) - elif DEBUG: print(s) - -################ -################ CANVAS API MAIN FETCHING FUNCTIONS -################ -################ -################ - - - - -# Main canvas querying fxn -def fetch(target,verbose=0,params=0,media=0): - # if there are more results, recursivly call myself, adding on to the results. - results = 0 - if target[0:4] != "http": target = url + target - if verbose: - print("++ Fetching: " + target) - if media: - r2 = requests.get(target, headers = header_media) - elif params: - r2 = requests.get(target, headers = header, params = params) - else: - r2 = requests.get(target, headers = header) - #if verbose: - #print "++ Got: " + r2.text - try: - results = json.loads(r2.text) - count = len(results) - except: - print("-- Failed to parse: ", r2.text) - if verbose: - print("Got %i results" % count) - if verbose > 1: - print(r2.headers) - - tempout = codecs.open('cache/fetchcache.txt','a','utf-8') - tempout.write(r2.text+"\n\n") - tempout.close() - - if ('link' in r2.headers and count > 0): - links = r2.headers['link'].split(',') - for L in links: - ll = L.split(';') - link = ll[0].replace("<","") - link = link.replace(">","") - if re.search(r'next', ll[1]): - if (verbose): print("++ More link: " + link) - #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500') - #if (verbose): print("++ More link: " + link) - - nest = fetch(link,verbose,params,media) - if isinstance(results,dict): results.update(nest) - else: results.extend(nest) - return results - -# Main canvas querying fxn - stream version - don't die on big requests -def fetch_stream(target,verbose=0): - # if there are more results, recursivly call myself, adding on to the results. - results = 0 - while target: - if target[0:4] != "http": target = url + target - if verbose: - print("++ Fetching: " + target) - r2 = requests.get(target, headers = header) - if r2.status_code == 502: - raise FetchError() - try: - results = json.loads(r2.text) - count = len(results) - except: - print("-- Failed to parse: ", r2.text) - if verbose: - print("Got %i results" % count) - if verbose > 1: - print(r2.headers) - tempout = codecs.open('cache/fetchcache.txt','a','utf-8') - tempout.write(r2.text+"\n\n") - tempout.close() - - next_link_found = 0 - if ('link' in r2.headers and count > 0): - links = r2.headers['link'].split(',') - for L in links: - ll = L.split(';') - link = ll[0].replace("<","") - link = link.replace(">","") - if re.search(r'next', ll[1]): - target = link - next_link_found = 1 - break - if not next_link_found: target = 0 - yield results - - -# for dicts with one key, collapse that one key out, cause -# paging makes problems... example: enrollment_terms -def fetch_collapse(target,collapse='',verbose=0): - # if there are more results, recursivly call myself, adding on to the results. 
- results = 0 - if target[0:4] != "http": target = url + target - if verbose: - print("++ Fetching: " + target) - r2 = requests.get(target, headers = header) - #if verbose: - #print "++ Got: " + r2.text - try: - results = json.loads(r2.text) - except: - print("-- Failed to parse: ", r2.text) - if verbose: print(r2.headers) - - if collapse and collapse in results: - results = results[collapse] - - if ('link' in r2.headers): - links = r2.headers['link'].split(',') - for L in links: - ll = L.split(';') - link = ll[0].replace("<","") - link = link.replace(">","") - if re.search(r'next', ll[1]): - if (verbose): print("++ More link: " + link) - nest = fetch_collapse(link, collapse, verbose) - if isinstance(results,dict): results.update(nest) - else: results.extend(nest) - return results - - - - - - - - - -################ -################ CANVAS DATA -################ -################ -################ - - -# Get canvas data 2024 style -def canvas_data_2024_run(): - print("Updating all tables.") - asyncio.run(canvas_data_2024()) - print("Done with all tables.") - - -async def canvas_data_2024(): - - base_url: str = os.environ["DAP_API_URL"] - client_id: str = os.environ["DAP_CLIENT_ID"] - client_secret: str = os.environ["DAP_CLIENT_SECRET"] - #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" - - # todo: use secrets - connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db" - - desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') - credentials = Credentials.create(client_id=client_id, client_secret=client_secret) - - async with DatabaseConnection(connection_string).open() as db_connection: - async with DAPClient(base_url, credentials) as session: - #tables = await session.get_tables("canvas") - for table in desired_tables: - print(f" trying to update {table} ") - try: - #await SQLReplicator(session, db_connection).initialize("canvas", table) - await SQLReplicator(session, db_connection).synchronize("canvas", table) - except Exception as e: - print(f" - skipping {table} because {e}") - - - -# Get canvas data 2024 style -def setup_canvas_data_2024_run(): - print("Setting up all tables.") - asyncio.run(setup_canvas_data_2024()) - print("Done with all tables.") - - -async def setup_canvas_data_2024(): - - base_url: str = os.environ["DAP_API_URL"] - client_id: str = os.environ["DAP_CLIENT_ID"] - client_secret: str = os.environ["DAP_CLIENT_SECRET"] - #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" - connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db" - - desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') - credentials = Credentials.create(client_id=client_id, client_secret=client_secret) - - async with DatabaseConnection(connection_string).open() as db_connection: - async with 
DAPClient(base_url, credentials) as session: - #tables = await session.get_tables("canvas") - for table in desired_tables: - print(f" {table}") - try: - await SQLReplicator(session, db_connection).initialize("canvas", table) - except Exception as e: - print(f" - skipping {table} because {e}") - - - - - - - - - - - - - - - - - - - - - - -################ -################ ROSTERS AND REGISTRATION -################ -################ -################ - -# todo: the pipeline is disorganized. Organize it to have -# a hope of taking all this to a higher level. -# - -# todo: where does this belong in the pipeline? compare with recent_schedules() - - - -# Take the generically named rosters uploads files and move them to a semester folder and give them a date. -def move_to_folder(sem,year,folder,files): - semester = year+sem - semester_path = 'cache/rosters/%s' % semester - if not os.path.isdir('cache/rosters/'+semester): - os.makedirs('cache/rosters/'+semester) - now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') - print("+ Moving roster files to folder: %s" % semester_path) - if not os.path.isdir(semester_path): - print("+ Creating folder: %s" % semester_path) - os.makedirs(semester_path) - def safe_move(src, dst): - try: - os.replace(src, dst) - except Exception as ex: - print(f"! move failed {src} -> {dst}: {ex}") - if 'courses.csv' in files: - safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now)) - if 'enrollments.csv' in files: - safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now)) - if 'users.csv' in files: - safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now)) - - - -# Take raw upload (csv) files and make one big json out of them. -# This relates to enrollment files, not schedule. -def convert_roster_files(semester="",year="",folder=""): - if not semester: - semester = input("the semester? (ex: spring) ") - folder = input("Folder? (ex 2020-02-25-14-58-20) ") - uf = open('cache/rosters/users-'+folder+'.csv','r') - cf = open('cache/rosters/courses-'+folder+'.csv','r') - ef = open('cache/rosters/enrollments-'+folder+'.csv','r') - u = csv.DictReader(uf) - c = csv.DictReader(cf) - e = csv.DictReader(ef) - uu = [i for i in u] - cc = [i for i in c] - ee = [i for i in e] - uf.close() - cf.close() - ef.close() - myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester) - - if os.path.exists(myrosterfile): - print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile) - last_fileobj = open(myrosterfile,'r') - last_file = json.load(last_fileobj) - - last_fileobj.close() - - info = last_file[3] - last_date = info['date_filestring'] - - print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date)) - - try: - os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' 
% (year,semester,last_date)) - print(' -- ok') - except Exception as e: - print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) - print(e) - myrosterfile = "new_" + myrosterfile - pass - #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json') - #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json') - - newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), } - try: - new_roster = codecs.open(myrosterfile,'w', 'utf-8') - new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 )) - new_roster.close() - print(" -- Wrote roster info to: %s." % myrosterfile) - except Exception as e: - print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) - print(" ** " + str(e)) - - - - - - -# From instructure sftp site -def fetch_current_rosters(sftp=None, label_hour=None): - """Download roster CSVs from Instructure SFTP and post-process. - - If sftp provided, reuse it (already chdir'd to SIS). - - Files are saved using label_hour (format YYYY-MM-DD-HH). If None, compute fallback label by flooring to the hour. - """ - import pysftp - def log(msg): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - try: - with open('cache/pipeline.log.txt','a', encoding='utf-8') as f: - f.write(f"[{ts}] {msg}\n") - except Exception: - print(msg) - - close_after = False - if sftp is None: - cnopts = pysftp.CnOpts() - cnopts.hostkeys = None - sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts) - sftp.chdir('SIS') - close_after = True - try: - now = datetime.datetime.now() - if not label_hour: - # default fallback: floor to hour - label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H') - - files = set(sftp.listdir()) - log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}") - need = ['login','users','courses','enrollments'] - saved = [] - for name in need: - remote = f'{name}.csv' - target = f'cache/rosters/{name}-{label_hour}.csv' - try: - if remote in files: - if not (os.path.exists(target) and os.path.getsize(target) > 0): - temp = target + '.part' - try: - if os.path.exists(temp): - os.remove(temp) - except Exception: - pass - sftp.get(remote, temp) - # atomic replace - os.replace(temp, target) - saved.append(remote) - log(f' saved {remote} -> {target}') - else: - log(f' already exists: {target}, skipping') - else: - log(f' missing on server: {remote}') - except Exception as ex: - log(f' download failed: {remote} -> {target} err={ex}') - - if len(saved) >= 3 and 'courses.csv' in saved or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'): - try: - with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses: - courses.readline() - a = courses.readline() - parts = a.split(',') - year = parts[1][0:4] - ss = parts[1][4:6] - sem = {'30':'spring', '50':'summer', '70':'fall' } - this_sem = sem.get(ss, 'spring') - log(f"post-process for semester={this_sem} year={year} label={label_hour}") - convert_roster_files(this_sem,year,label_hour) - move_to_folder(this_sem,year,label_hour,saved) - except Exception as expp: - log(f'post-processing failed: {expp}') - else: - log('not enough files to post-process yet') - finally: - if close_after: - try: - sftp.close() - except Exception: - pass - -def fetch_current_rosters_auto(poll_seconds=15): - """Poll Instructure SFTP from the top of each hour until all 4 roster files appear, then download once. 
- - Robust logging to cache/pipeline.log.txt - - Stops polling for the remainder of that hour after success - - Tries again on the next hour - """ - import pysftp - from contextlib import contextmanager - - log_path = 'cache/pipeline.log.txt' - - def log(msg): - ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - try: - with open(log_path, 'a', encoding='utf-8') as f: - f.write(f"[{ts}] {msg}\n") - except Exception: - print(f"[log-fail] {msg}") - - @contextmanager - def sftp_conn(): - cnopts = pysftp.CnOpts() - cnopts.hostkeys = None - conn = None - try: - conn = pysftp.Connection(instructure_url, username=instructure_username, - private_key=instructure_private_key, cnopts=cnopts) - conn.chdir('SIS') - yield conn - finally: - try: - if conn: - conn.close() - except Exception: - pass - - required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'} - current_window = None # e.g., '2025-08-30-14' - window_done = False - window_end = None - - def compute_window(now): - """Return (label_hour_str, window_end_dt) if within a window, else (None, None). - Window spans [H-5m, H+5m] around each top-of-hour H. - If minute >= 55 -> window is next hour - If minute <= 5 -> window is current hour - Else -> not in window. - """ - m = now.minute - if m >= 55: - base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)) - elif m <= 5: - base = now.replace(minute=0, second=0, microsecond=0) - else: - return (None, None) - label = base.strftime('%Y-%m-%d-%H') - end = base + timedelta(minutes=5) # end of window - return (label, end) - - log('fetch_current_rosters_auto: starting poll loop') - while True: - now = datetime.datetime.now() - hour_key = now.strftime('%Y-%m-%d %H') - label, enddt = compute_window(now) - if label is None: - time.sleep(max(5, int(poll_seconds))) - continue - - if label != current_window: - current_window = label - window_done = False - window_end = enddt - log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.') - - if window_done: - time.sleep(5) - continue - - # Poll SFTP for files - try: - with sftp_conn() as sftp: - files = set(sftp.listdir()) - missing = list(required - files) - if missing: - log(f'Not ready yet. Missing: {missing}') - else: - log('All required files present inside window. Starting download...') - try: - fetch_current_rosters(sftp=sftp, label_hour=current_window) - log('Download complete. Marking window_done for this window.') - window_done = True - except Exception as ex_dl: - import traceback - log('Download failed: ' + str(ex_dl)) - log(traceback.format_exc()) - except Exception as ex_list: - import traceback - log('SFTP list failed: ' + str(ex_list)) - log(traceback.format_exc()) - - # Check for window timeout - try: - if not window_done and window_end and now >= window_end: - log(f'REPORT: window {current_window} ended without all files. Consider alerting/email.') - window_done = True # stop further checks this window - except Exception: - pass - - time.sleep(max(5, int(poll_seconds))) - - -# Canvas data, download all new files -def sync_non_interactive(): - resp = do_request('/api/account/self/file/sync') - mylog.write(json.dumps(resp, indent=4)) - #mylog.close() - gotten = os.listdir(local_data_folder) - wanted = [] - i = 0 - for x in resp['files']: - filename = x['filename'] - exi = "No " - if filename in gotten: exi = "Yes" - else: wanted.append(x) - - print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename )) - i += 1 - print("I will attempt to download %i files." 
% len(wanted)) - - #answer = input("Press enter to begin, or q to quit ") - #if not answer == '': return - - good_count = 0 - bad_count = 0 - for W in wanted: - print("Downloading: " + W['filename']) - response = requests.request(method='GET', url=W['url'], stream=True) - if(response.status_code != 200): - print('Request response went bad. Got back a %s code, meaning the request was %s' % \ - (response.status_code, response.reason)) - print('URL: ' + W['url']) - bad_count += 1 - - else: - #Use the downloaded data - with open(local_data_folder + W['filename'], 'wb') as fd: - for chunk in response.iter_content(chunk_size=128): - fd.write(chunk) - print("Success") - good_count += 1 - print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count)) - - -## OLD STYLE CANVAS DATA - -# Get something from Canvas Data -def do_request(path): - #Set up the request pieces - method = 'GET' - host = 'api.inshosteddata.com' - apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT') - apiContentType = 'application/json' - - msgList = [] - msgList.append(method) - msgList.append(host) - msgList.append(apiContentType) - msgList.append('') - msgList.append(path) - msgList.append('') - msgList.append(apiTime) - msgList.append(apiSecret) - - msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8') - - sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest()) - sig = sig.decode('utf-8') - - headers = {} - headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig) - headers['Date'] = apiTime - headers['Content-type'] = apiContentType - - - #Submit the request/get a response - uri = "https://"+host+path - print (uri) - print (headers) - response = requests.request(method='GET', url=uri, headers=headers, stream=True) - - #Check to make sure the request was ok - if(response.status_code != 200): - print(('Request response went bad. 
Got back a ', response.status_code, ' code, meaning the request was ', response.reason)) - else: - #Use the downloaded data - jsonData = response.json() - #print(json.dumps(jsonData, indent=4)) - return jsonData - - - - - - -################ -################ SENDING DATA AWAY -################ -################ -################ - -# Upload a json file to www -def put_file(remotepath,localpath, localfile,prompt=1): - import pysftp - show_all = 0 - folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - cnopts = pysftp.CnOpts() - cnopts.hostkeys = None - - with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: - #todo: these paths - #files = sftp.listdir() - #print(folder + "\tI see these files on remote: ", files, "\n") - sftp.chdir(remotepath) - files = sftp.listdir() - if show_all: print(folder + "\tI see these files on remote: ", files, "\n") - localf = os.listdir(localpath) - if show_all: print("I see these local: ", localf) - if prompt: - input('ready to upload') - sftp.put(localpath+localfile, localfile, preserve_mtime=True) - sftp.close() - - - """ - # copy files and directories from local static, to remote static, - # preserving modification times on the files - for f in localf: - print("This local file: " + f + " ", end=' ') - if not f in files: - sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True) - print("Uploaded.") - else: - print("Skipped.") - """ - - """if len(files)==3 and 'users.csv' in files: - sftp.get('courses.csv','rosters/courses-'+folder+'.csv') - sftp.get('users.csv','rosters/users-'+folder+'.csv') - sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv') - print folder + '\tSaved three data files in rosters folder.' - - courses = open('rosters/courses-'+folder+'.csv','r') - courses.readline() - a = courses.readline() - print a - courses.close() - parts = a.split(',') - year = parts[1][0:4] - ss = parts[1][4:6] - #print parts[1] - sem = {'30':'spring', '50':'summer', '70':'fall' } - this_sem = sem[ss] - #print this_sem, "", year - print folder + '\tbuilding data file...' - convert_roster_files(this_sem,year,folder) - print folder + '\tmoving files...' - move_to_folder(this_sem,year,folder) - else: - print folder + "\tDon't see all three files.""" - - - -################ -################ GOOGLE DOCS -################ -################ -################ - -def sec(t): return "
    "+t+"

    \n" -def para(t): return "

    "+t+"

    \n" -def ul(t): return "\n" -def li(t): return "
  • "+t+"
  • \n" - -def question(t,bracket=1): - ret = '' - match = re.search( r'\[(.*)\]', t) - if match and bracket: - ret += "" - t = re.sub( r'\[.*\]','',t) - else: - parts = t.split(' ') - id = '' - for p in parts: - if re.search(r'[a-zA-Z]',p[0]): id += p[0] - ret += "" % id.lower() - return ret + '

    ' + t + '

    \n
    ' - -def answer(t): - return t + '
    \n' - -def read_paragraph_element(element,type="NORMAL_TEXT"): - """Returns the text in the given ParagraphElement. - - Args: - element: a ParagraphElement from a Google Doc. - """ - text_run = element.get('textRun') - begin = '' - end = '' - if not text_run: - return '' - if 'textStyle' in text_run and 'link' in text_run['textStyle']: - begin = '' - end = '' - if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": - begin = '' + begin - end = end + '' - - content = text_run.get('content') - content = re.sub(u'\u000b','
    \n',content) - - return begin + content + end - - -def get_doc(docid, bracket=1, verbose=0): - import pickle - import os.path - from googleapiclient.discovery import build - from google_auth_oauthlib.flow import InstalledAppFlow - from google.auth.transport.requests import Request - - #ooout = open(fileout,'w') - - # If modifying these scopes, delete the file token.pickle. - SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] - creds = None - # The file token.pickle stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. - if os.path.exists('token.pickle'): - with open('token.pickle', 'rb') as token: - creds = pickle.load(token) - # If there are no (valid) credentials available, let the user log in. - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - 'credentials.json', SCOPES) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open('token.pickle', 'wb') as token: - pickle.dump(creds, token) - - service = build('docs', 'v1', credentials=creds) - - # Retrieve the documents contents from the Docs service. - document = service.documents().get(documentId=docid).execute() - if verbose: print(document) - - tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') - tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n") - if verbose: print('The title of the document is: {}'.format(document.get('title'))) - doc_content = document.get('body').get('content') - if verbose: print(doc_content) - - doc_objects = document.get('inlineObjects') - if verbose: print(doc_objects) - - doc_lists = document.get('lists') - - text = '
    ' - last_type = '' - answer_text = '' - in_a_list = '' - - img_count = 1 - img_lookup = {} - img_heights = {} - img_widths = {} - - if doc_objects: - for k,value in doc_objects.items(): - tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") - if 'inlineObjectProperties' in value: - if 'embeddedObject' in value['inlineObjectProperties']: - if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: - if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: - print(k) - uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] - response = requests.get(uu, stream=True) - name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] - img_count += 1 - - img_lookup[k] = name - - with open('cache/doc_images/'+name, 'wb') as out_file: - shutil.copyfileobj(response.raw, out_file) - print(uu) - print(response.headers) - print(name) - #input('x?') - del response - if 'size' in value['inlineObjectProperties']['embeddedObject']: - img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) - img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) - - tempout.write('- - - - - - - -\n\n') - #for value in doc_lists: - # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n") - - tempout.write('- - - - - - - -\n\n') - list_stack = [] - list_depth = 0 - last_list_depth = 0 - for value in doc_content: - tempout.write( json.dumps(value,indent=2) + "\n\n\n") - if verbose: print(json.dumps(value, sort_keys=True, indent=4)) - - # todo: x link, x bold, list, image. - tag_fxn = para - if 'paragraph' in value: - this_text = '' - - if 'bullet' in value['paragraph']: - # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one. - - lid = value['paragraph']['bullet']['listId'] - - if not list_stack: # 1 - list_stack.append(lid) - else: - if lid == list_stack[0]: # 2 - pass - - else: - if not lid in list_stack: # 3 - list_stack.append(lid) - else: # 4 - x = list_stack.pop() - while x != lid: list_stack.pop() - elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open. 
- list_stack = [] - - list_depth = len(list_stack) - - deeper = list_depth - last_list_depth - - if deeper > 0: - answer_text += "" * deeper - - if len(list_stack): - tag_fxn = li - - elements = value.get('paragraph').get('elements') - - # inlineObjectElement": { - # "inlineObjectId": "kix.ssseeu8j9cfx", - - if 'paragraphStyle' in value.get('paragraph'): - style = value.get('paragraph').get('paragraphStyle') - #text += json.dumps(style, sort_keys=True, indent=4) - if 'namedStyleType' in style: - type = style['namedStyleType'] - - for elem in elements: - - # text content - this_text += read_paragraph_element(elem,type) - - # image content - if 'inlineObjectElement' in elem: - vpi = elem['inlineObjectElement'] - if 'inlineObjectId' in vpi: - ii = vpi['inlineObjectId'] - if ii in img_lookup: - img = img_lookup[ii] - h = img_heights[ii] - w = img_widths[ii] - this_text += '' % (img,w,h) - - - - if last_type=='NORMAL_TEXT' and type!=last_type: - text += answer(answer_text) - answer_text = '' - - if type=='HEADING_2': - text += sec(this_text) - this_text = '' - elif type=='HEADING_3': - text += question(this_text,bracket) - this_text = '' - else: - answer_text += tag_fxn(this_text) - this_text = '' - last_type = type - last_list_depth = list_depth - - elif 'table' in value: - # The text in table cells are in nested Structural Elements and tables may be - # nested. - text += "\nTABLE\n" - #table = value.get('table') - #for row in table.get('tableRows'): - # cells = row.get('tableCells') - # for cell in cells: - # text += read_strucutural_elements(cell.get('content')) - #elif 'tableOfContents' in value: - # # The text in the TOC is also in a Structural Element. - # toc = value.get('tableOfContents') - # text += read_strucutural_elements(toc.get('content')) - - #else: - # print(json.dumps(value, sort_keys=True, indent=4)) - - text += answer(answer_text) - #text += '
    ' - #print(text) - return text - -######### TRY #2 ###### - - -def read_paragraph_element_2(element,type="NORMAL_TEXT"): - text_run = element.get('textRun') - begin = '' - end = '' - if not text_run: return '' - if 'textStyle' in text_run and 'link' in text_run['textStyle']: - begin = '' - end = '' - if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": - begin = '' + begin - end = end + '' - elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT": - begin = '' + begin - end = end + '' - content = text_run.get('content') - content = re.sub(u'\u000b','
    \n',content) - return begin + content + end - -# t is a string that begins with "Icons: " ... and contains comma(space) separated list -def handle_icons(t): - text = t[7:].strip() - parts = text.split(", ") - return ('icons',parts) - -# t is a string that begins with "Tags: " ... and contains comma(space) separated list -def handle_tags(t): - text = t[6:].strip() - parts = text.split(", ") - return ('tags',parts) - -def handle_question(t,bracket=1): - anchor = '' - match = re.search( r'\[(.*)\]', t) - if match and bracket: - anchor = match.group(1).lower() - t = re.sub( r'\[.*\]','',t) - else: - parts = t.split(' ') - for p in parts: - if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower() - return ('question', t, anchor) - -def handle_answer(t): - return ('answer',t) - -def handle_sec(t): return ('section',t) -def handle_para(t): return ('paragraph',t) -def handle_ul(t): return ('unorderdedlist',t) -def handle_li(t): return ('listitem',t) - - - -img_count = 1 -img_lookup = {} -img_heights = {} -img_widths = {} - - -def fetch_doc_image(k,value): - global img_count, img_lookup, img_heights, img_widths - if 'inlineObjectProperties' in value: - if 'embeddedObject' in value['inlineObjectProperties']: - if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: - if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: - print(k) - uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] - response = requests.get(uu, stream=True) - name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] - img_count += 1 - img_lookup[k] = name - - with open('cache/doc_images/'+name, 'wb') as out_file: - shutil.copyfileobj(response.raw, out_file) - print(uu) - print(response.headers) - print(name) - del response - if 'size' in value['inlineObjectProperties']['embeddedObject']: - img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) - img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) - - -def get_doc_generic(docid, bracket=1, verbose=0): - import pickle - import os.path - from googleapiclient.discovery import build - from google_auth_oauthlib.flow import InstalledAppFlow - from google.auth.transport.requests import Request - global img_count, img_lookup, img_heights, img_widths - -# If modifying these scopes, delete the file token.pickle. - SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] - creds = None - # The file token.pickle stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. - if os.path.exists('token.pickle'): - with open('token.pickle', 'rb') as token: - creds = pickle.load(token) - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - 'credentials.json', SCOPES) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open('token.pickle', 'wb') as token: - pickle.dump(creds, token) - - service = build('docs', 'v1', credentials=creds) - - # Retrieve the documents contents from the Docs service. 
- document = service.documents().get(documentId=docid).execute() - - tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') - tempout.write( json.dumps(document,indent=2) \ - + "\n\n\n------------------------------------\n\n") - if verbose: print('The title of the document is: {}'.format(document.get('title'))) - - doc_content = document.get('body').get('content') - doc_objects = document.get('inlineObjects') - doc_lists = document.get('lists') - - #text = '' - result = [] - last_type = '' - #answer_text = '' - answer = [] - in_a_list = '' - - # Get all the images - for k,value in doc_objects.items(): - tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") - fetched = fetch_doc_image(k,value) - - list_stack = [] - list_depth = 0 - last_list_depth = 0 - for value in doc_content: - tempout.write( json.dumps(value,indent=2) + "\n\n\n") - if verbose: print(json.dumps(value, sort_keys=True, indent=4)) - - tag_fxn = handle_para - if 'paragraph' in value: - this_text = '' - - # First we deal with if we're in a list. - if 'bullet' in value['paragraph']: - # either we're (1)starting a new list, (2)in one (do nothing), - # (3)starting a nested one, or (4)finished a nested one. - lid = value['paragraph']['bullet']['listId'] - if not list_stack: # 1 - list_stack.append(lid) - else: - if not lid == list_stack[0]: - if not lid in list_stack: # 3 - list_stack.append(lid) - else: # 4 - x = list_stack.pop() - while x != lid: list_stack.pop() - elif len(list_stack) > 0: - # current para isn't a bullet but we still have a list open. - list_stack = [] - - - list_depth = len(list_stack) - deeper = list_depth - last_list_depth - if deeper > 0: - answer.append("" * deeper) - if len(list_stack): - tag_fxn = handle_li - - # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next, - elements = value.get('paragraph').get('elements') - if 'paragraphStyle' in value.get('paragraph'): - style = value.get('paragraph').get('paragraphStyle') - if 'namedStyleType' in style: - type = style['namedStyleType'] - - # and FINALLY, the actual contents. - for elem in elements: - # text content - this_text += read_paragraph_element_2(elem,type) - - # image content - if 'inlineObjectElement' in elem: - vpi = elem['inlineObjectElement'] - if 'inlineObjectId' in vpi: - ii = vpi['inlineObjectId'] - if ii in img_lookup: - img = img_lookup[ii] - h = img_heights[ii] - w = img_widths[ii] - this_text += '' % (img,w,h) - - - # Now for something tricky. Call an appropriate handler, based on: - # (a) what is the paragraph style type? - # (b) is it different from the prev one? 
- - if last_type=='NORMAL_TEXT' and type!=last_type: - if this_text.strip(): - result.append(handle_answer(answer)) - answer = [] - #answer_text = '' - - if type=='HEADING_2' and this_text.strip(): - result.append( handle_sec(this_text) ) - this_text = '' - elif type=='HEADING_3' and this_text.strip(): - result.append(handle_question(this_text,bracket)) - this_text = '' - else: - if this_text.lower().startswith('tags:'): - tag_fxn = handle_tags - if this_text.lower().startswith('icons:'): - tag_fxn = handle_icons - if this_text.strip(): - answer.append(tag_fxn(this_text)) - this_text = '' - last_type = type - last_list_depth = list_depth - - elif 'table' in value: - pass - - - result.append(handle_answer(answer)) - return json.dumps(result,indent=4) - - - - -def process_reg_history(term='fa25'): - from collections import defaultdict - from itertools import groupby - from operator import itemgetter - - def read_grouped_csv(path): - with open(path, newline='') as f: - fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] - reader = csv.DictReader(f, fieldnames=fieldnames) - rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp - grouped = {} - for ts, group in groupby(rows, key=itemgetter('datetime')): - grouped[ts] = {r['crn']: r for r in group} - return grouped - - def crossed_threshold(old_val, new_val, max_val): - thresholds = [0.25, 0.5, 0.75, 1.0] - if int(max_val) == 0: - return False, None - old_ratio = int(old_val) / int(max_val) - new_ratio = int(new_val) / int(max_val) - for t in thresholds: - if old_ratio < t <= new_ratio: - return True, int(t * 100) - return False, None - - def detect_changes(prev, curr): - changes = defaultdict(list) - - all_crns = prev.keys() | curr.keys() - for crn in all_crns: - o, n = prev.get(crn), curr.get(crn) - if not o: - changes[crn].append((n['datetime'], "Section was added.")) - elif not n: - changes[crn].append(( - o['datetime'], - f"Section was removed (last seen: teacher {o['teacher']}, " - f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)." 
- )) - else: - dt = n['datetime'] - if o['teacher'] != n['teacher']: - changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}.")) - if o['enrolled'] != n['enrolled']: - crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) - if crossed: - changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).")) - if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']: - changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}.")) - return changes - - def time_to_iso(s): - return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat() - - def detect_changes_structured(prev, curr): - changes = defaultdict(list) - - all_crns = prev.keys() | curr.keys() - for crn in all_crns: - o, n = prev.get(crn), curr.get(crn) - if not o: - changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."}) - elif not n: - changes[crn].append( - {'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.", - 'value': o['enrolled'], 'capacity': o['max'], }) - else: - dt = time_to_iso(n['datetime']) - if o['teacher'] != n['teacher']: - changes[crn].append({'time':dt, "type":'teacher_change', - 'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.", - 'old_teacher': o['teacher'], 'new_teacher': n['teacher'], }) - if o['enrolled'] != n['enrolled']: - crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) - if crossed: - changes[crn].append({'time':dt, "type":'enrollment_milestone', - 'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).", - 'percent':percent,'value':n['enrolled'],'capacity':n['max'] }) - if int(n['waitlisted']) > 10 and o['waitlisted'] < n['waitlisted']: - changes[crn].append({'time':dt, "type":'enrollment_milestone', - 'message': f"Waitlist exceeds 10: {n['waitlisted']}).", - 'value':n['waitlisted']}) - return changes - - - def process_diff_timeline(path): - snapshots = read_grouped_csv(path) - timeline = sorted(snapshots.keys()) - timeline_diffs = [] - timeline_diffs_structured = [] - course_names = {} # crn -> latest known course name - - for i in range(1, len(timeline)): - prev_ts, curr_ts = timeline[i-1], timeline[i] - prev, curr = snapshots[prev_ts], snapshots[curr_ts] - - # update course name map - for crn, row in curr.items(): - course_names[crn] = row['course'] - - delta = detect_changes(prev, curr) - timeline_diffs.append(delta) - - delta_structured = detect_changes_structured(prev,curr) - timeline_diffs_structured.append(delta_structured) - - # Flatten and group by crn - crn_changes = defaultdict(list) - for delta in timeline_diffs: - for crn, changes in delta.items(): - crn_changes[crn].extend(changes) - - # Flatten and group by crn - crn_changes_structured = defaultdict(list) - for delta in timeline_diffs_structured: - for crn, changes in delta.items(): - crn_changes_structured[crn].extend(changes) - - # Sort changes for each CRN by datetime - for crn in crn_changes: - crn_changes[crn].sort(key=lambda x: x[0]) - - # Sort changes for each CRN by datetime - for crn in crn_changes_structured: - crn_changes[crn].sort(key=lambda x: x[0]) - - return crn_changes, crn_changes_structured, course_names - - fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text - fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8') - fresh_file.write(fresh_history) - fresh_file.close() - - output1 = 
codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8') - output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8') - changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv") - - # once for plain text - - for crn in sorted(changes, key=lambda c: course_names.get(c, "")): - course = course_names.get(crn, "") - course_output = {'code': course, 'crn':crn,'events':[]} - print(f"\n{course} (CRN {crn}):") - output1.write(f"\n{course} (CRN {crn}):\n") - for dt, msg in changes[crn]: - print(f" [{dt}] {msg}") - output1.write(f" [{dt}] {msg}\n") - - course_output['events'].append({'message':msg, 'time':time_to_iso(dt)}) - - # again for structured - crn_list = [] - - for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")): - course = course_names.get(crn, "") - course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]} - crn_list.append(course_output) - - output2.write( json.dumps(crn_list,indent=2) ) - output2.close() - - - -def recreate_all(): - # Use canonical semester short codes from semesters.py - try: - from semesters import code as SEM_CODES - except Exception: - SEM_CODES = [] - for x in SEM_CODES: - try: - recreate_reg_data(x) - except Exception as e: - print(f'Failed on {x} with: {e}') - - -def recreate_reg_data(term="fa25"): - from collections import defaultdict - from datetime import datetime - - def parse_row(row): - dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M") - crn = row['crn'] - enrolled = int(row['enrolled']) - return dt, row['datetime'], crn, enrolled - - def reduce_latest_per_day(rows): - latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled) - latest_ts_by_date = {} # date → (dt, ts) for header naming - - for row in rows: - dt, full_ts, crn, enrolled = parse_row(row) - date_str = dt.date().isoformat() - ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want - - # for each crn, per day, keep latest reading - if date_str not in latest[crn] or dt > latest[crn][date_str][0]: - latest[crn][date_str] = (dt, ts_header, enrolled) - - # also record latest timestamp per day for consistent column headers - if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]: - latest_ts_by_date[date_str] = (dt, ts_header) - - return latest, [ts for _, ts in sorted(latest_ts_by_date.values())] - - def pivot_table(latest, headers): - crns = sorted(latest) - table = [] - - for crn in crns: - row = [crn] - for ts in headers: - date_str = ts[:10] # match on YYYY-MM-DD - val = latest[crn].get(date_str) - if val and val[1] == ts: - row.append(str(val[2])) - else: - row.append("") - table.append(row) - - return ['crn'] + headers, table - - #with open(f"cache/reg_history_{term}.csv", newline='') as f: - from io import StringIO - url = f"https://gavilan.cc/schedule/reg_history_{term}.csv" - - # Download - resp = requests.get(url) - resp.raise_for_status() # raises if bad status - - # Wrap the text in a file-like object - f = StringIO(resp.text) - - fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] - reader = csv.DictReader(f, fieldnames=fieldnames) - rows = list(reader) - - latest, headers = reduce_latest_per_day(rows) - header_row, table = pivot_table(latest, headers) - - with open(f"cache/reg_data_{term}.csv", "w", newline='') as f: - writer = csv.writer(f) - writer.writerow(header_row) - writer.writerows(table) - - -if __name__ == "__main__": - - print ('') - options = { 1: ['Fetch rosters on 
schedule',fetch_current_rosters_auto] , - 2: ['Get canvas data 2024 style', canvas_data_2024_run ], - 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run], - 4: ['Narrative timeline of section updates', process_reg_history], - 5: ['Create narrative format all semesters', recreate_all], - 6: ['Recreate reg_data from full reg history', recreate_reg_data], - } - - '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] , - 2: ['Fetch rosters',fetch_current_rosters] , - 3: - 4: ['Compute how registration is filling up classes', schedule_filling] , - 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] , - 6: ['Canvas data: interactive sync', interactive ], - 7: ['Canvas data: automated sync', sync_non_interactive ], - 8: - 9: - 16: ['Scrape schedule from ssb', scrape_schedule_multi ], - 14: ['Generate latestart schedule', list_latestarts ], - 15: ['Test ssb calls with python', scrape_schedule_py ], - 10: ['schedule to db', scrape_for_db ], - 11: ['clean argos draft schedule file', argos_data_from_cvc], - 12: ['make expanded schedule json files of old semesters', expand_old_semesters ], - 13: ['Parse deanza schedule', dza_sched ], - ''' - - - if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): - resp = int(sys.argv[1]) - print("\n\nPerforming: %s\n\n" % options[resp][0]) - - else: - print ('') - for key in options: - print(str(key) + '.\t' + options[key][0]) - - print('') - resp = input('Choose: ') - - # Call the function in the options dict - options[ int(resp)][1]() - -# Testing - -#if __name__ == "__main__": - #users = fetch('/api/v1/courses/69/users?per_page=100',1) - #print "These are the users: " - #print users - - #getSemesterSchedule() - - - #get_doc() - #pass +import util +import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path +import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib +from datetime import timedelta + +from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media +from canvas_secrets import instructure_url, instructure_username, instructure_private_key + +import os, asyncio +from dap.api import DAPClient +from dap.dap_types import Credentials +from dap.integration.database import DatabaseConnection +from dap.replicator.sql import SQLReplicator + + + +""" +Everything to do with fetching data, + - From iLearn, via token + - current roster uploads from instructures sftp site + - raw logs and other from canvas data repo + - from ssb, use firefox to scrape the schedule + + +And some subsequent processing: + - Raw roster files, into a more compact json format + - Raw logs into something more useful +""" + +verbose = False + +users = {} +users_by_id = {} + +# todo: all these constants for SSB -- line 1008 +# +# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python + + + + + + +sys.setrecursionlimit( 100000 ) + +local_data_folder = 'cache/canvas_data/' +mylog = codecs.open(local_data_folder + 'temp_log.txt','w') + + + + +class FetchError(Exception): + pass + + +DEBUG = 0 + +def d(s,end=''): + global DEBUG + if end and DEBUG: print(s,end=end) + elif DEBUG: print(s) + +################ +################ CANVAS API MAIN FETCHING FUNCTIONS +################ +################ +################ + + + + +# Main canvas querying fxn +def fetch(target,verbose=0,params=0,media=0): + # if there are more results, recursivly call myself, adding on to the results. 
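+    # Illustration (Link-header format per the Canvas REST docs; URLs are made-up examples):
+    # pagination arrives in the HTTP Link response header, roughly
+    #   <https://<domain>/api/v1/courses?page=2&per_page=10>; rel="next",
+    #   <https://<domain>/api/v1/courses?page=1&per_page=10>; rel="first", ...
+    # The code below splits that header on commas, strips the angle brackets, and
+    # recurses on the entry tagged rel="next", merging each page into `results`
+    # (dict.update for dict payloads, list.extend for list payloads).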
+ results = 0 + if target[0:4] != "http": target = url + target + if verbose: + print("++ Fetching: " + target) + if media: + r2 = requests.get(target, headers = header_media) + elif params: + r2 = requests.get(target, headers = header, params = params) + else: + r2 = requests.get(target, headers = header) + #if verbose: + #print "++ Got: " + r2.text + try: + results = json.loads(r2.text) + count = len(results) + except: + print("-- Failed to parse: ", r2.text) + if verbose: + print("Got %i results" % count) + if verbose > 1: + print(r2.headers) + + tempout = codecs.open('cache/fetchcache.txt','a','utf-8') + tempout.write(r2.text+"\n\n") + tempout.close() + + if ('link' in r2.headers and count > 0): + links = r2.headers['link'].split(',') + for L in links: + ll = L.split(';') + link = ll[0].replace("<","") + link = link.replace(">","") + if re.search(r'next', ll[1]): + if (verbose): print("++ More link: " + link) + #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500') + #if (verbose): print("++ More link: " + link) + + nest = fetch(link,verbose,params,media) + if isinstance(results,dict): results.update(nest) + else: results.extend(nest) + return results + +# Main canvas querying fxn - stream version - don't die on big requests +def fetch_stream(target,verbose=0): + # if there are more results, recursivly call myself, adding on to the results. + results = 0 + while target: + if target[0:4] != "http": target = url + target + if verbose: + print("++ Fetching: " + target) + r2 = requests.get(target, headers = header) + if r2.status_code == 502: + raise FetchError() + try: + results = json.loads(r2.text) + count = len(results) + except: + print("-- Failed to parse: ", r2.text) + if verbose: + print("Got %i results" % count) + if verbose > 1: + print(r2.headers) + tempout = codecs.open('cache/fetchcache.txt','a','utf-8') + tempout.write(r2.text+"\n\n") + tempout.close() + + next_link_found = 0 + if ('link' in r2.headers and count > 0): + links = r2.headers['link'].split(',') + for L in links: + ll = L.split(';') + link = ll[0].replace("<","") + link = link.replace(">","") + if re.search(r'next', ll[1]): + target = link + next_link_found = 1 + break + if not next_link_found: target = 0 + yield results + + +# for dicts with one key, collapse that one key out, cause +# paging makes problems... example: enrollment_terms +def fetch_collapse(target,collapse='',verbose=0): + # if there are more results, recursivly call myself, adding on to the results. 
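+    # Illustration (the endpoint named in the comment above): the enrollment-terms API
+    # returns a wrapper object like {"enrollment_terms": [ {...}, {...} ]} rather than a
+    # bare list, so naively merging pages with dict.update() would overwrite the previous
+    # page's list. Passing collapse="enrollment_terms" unwraps each page to the inner
+    # list first, and the pages are then concatenated with list.extend().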
+ results = 0 + if target[0:4] != "http": target = url + target + if verbose: + print("++ Fetching: " + target) + r2 = requests.get(target, headers = header) + #if verbose: + #print "++ Got: " + r2.text + try: + results = json.loads(r2.text) + except: + print("-- Failed to parse: ", r2.text) + if verbose: print(r2.headers) + + if collapse and collapse in results: + results = results[collapse] + + if ('link' in r2.headers): + links = r2.headers['link'].split(',') + for L in links: + ll = L.split(';') + link = ll[0].replace("<","") + link = link.replace(">","") + if re.search(r'next', ll[1]): + if (verbose): print("++ More link: " + link) + nest = fetch_collapse(link, collapse, verbose) + if isinstance(results,dict): results.update(nest) + else: results.extend(nest) + return results + + + + + + + + + +################ +################ CANVAS DATA +################ +################ +################ + + +# Get canvas data 2024 style +def canvas_data_2024_run(): + print("Updating all tables.") + asyncio.run(canvas_data_2024()) + print("Done with all tables.") + + +async def canvas_data_2024(): + + base_url: str = os.environ["DAP_API_URL"] + client_id: str = os.environ["DAP_CLIENT_ID"] + client_secret: str = os.environ["DAP_CLIENT_SECRET"] + #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" + + # todo: use secrets + connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db" + + desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') + credentials = Credentials.create(client_id=client_id, client_secret=client_secret) + + async with DatabaseConnection(connection_string).open() as db_connection: + async with DAPClient(base_url, credentials) as session: + #tables = await session.get_tables("canvas") + for table in desired_tables: + print(f" trying to update {table} ") + try: + #await SQLReplicator(session, db_connection).initialize("canvas", table) + await SQLReplicator(session, db_connection).synchronize("canvas", table) + except Exception as e: + print(f" - skipping {table} because {e}") + + + +# Get canvas data 2024 style +def setup_canvas_data_2024_run(): + print("Setting up all tables.") + asyncio.run(setup_canvas_data_2024()) + print("Done with all tables.") + + +async def setup_canvas_data_2024(): + + base_url: str = os.environ["DAP_API_URL"] + client_id: str = os.environ["DAP_CLIENT_ID"] + client_secret: str = os.environ["DAP_CLIENT_SECRET"] + #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" + connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db" + + desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') + credentials = Credentials.create(client_id=client_id, client_secret=client_secret) + + async with DatabaseConnection(connection_string).open() as db_connection: + async with 
DAPClient(base_url, credentials) as session: + #tables = await session.get_tables("canvas") + for table in desired_tables: + print(f" {table}") + try: + await SQLReplicator(session, db_connection).initialize("canvas", table) + except Exception as e: + print(f" - skipping {table} because {e}") + + + + + + + + + + + + + + + + + + + + + + +################ +################ ROSTERS AND REGISTRATION +################ +################ +################ + +# todo: the pipeline is disorganized. Organize it to have +# a hope of taking all this to a higher level. +# + +# todo: where does this belong in the pipeline? compare with recent_schedules() + + + +# Take the generically named rosters uploads files and move them to a semester folder and give them a date. +def move_to_folder(sem,year,folder,files): + semester = year+sem + semester_path = 'cache/rosters/%s' % semester + if not os.path.isdir('cache/rosters/'+semester): + os.makedirs('cache/rosters/'+semester) + now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') + print("+ Moving roster files to folder: %s" % semester_path) + if not os.path.isdir(semester_path): + print("+ Creating folder: %s" % semester_path) + os.makedirs(semester_path) + def safe_move(src, dst): + try: + os.replace(src, dst) + except Exception as ex: + print(f"! move failed {src} -> {dst}: {ex}") + if 'courses.csv' in files: + safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now)) + if 'enrollments.csv' in files: + safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now)) + if 'users.csv' in files: + safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now)) + + + +# Take raw upload (csv) files and make one big json out of them. +# This relates to enrollment files, not schedule. +def convert_roster_files(semester="",year="",folder=""): + if not semester: + semester = input("the semester? (ex: spring) ") + folder = input("Folder? (ex 2020-02-25-14-58-20) ") + uf = open('cache/rosters/users-'+folder+'.csv','r') + cf = open('cache/rosters/courses-'+folder+'.csv','r') + ef = open('cache/rosters/enrollments-'+folder+'.csv','r') + u = csv.DictReader(uf) + c = csv.DictReader(cf) + e = csv.DictReader(ef) + uu = [i for i in u] + cc = [i for i in c] + ee = [i for i in e] + uf.close() + cf.close() + ef.close() + myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester) + + if os.path.exists(myrosterfile): + print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile) + last_fileobj = open(myrosterfile,'r') + last_file = json.load(last_fileobj) + + last_fileobj.close() + + info = last_file[3] + last_date = info['date_filestring'] + + print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date)) + + try: + os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' 
% (year,semester,last_date)) + print(' -- ok') + except Exception as e: + print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) + print(e) + myrosterfile = "new_" + myrosterfile + pass + #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json') + #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json') + + newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), } + try: + new_roster = codecs.open(myrosterfile,'w', 'utf-8') + new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 )) + new_roster.close() + print(" -- Wrote roster info to: %s." % myrosterfile) + except Exception as e: + print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) + print(" ** " + str(e)) + + + + + + +# From instructure sftp site +def fetch_current_rosters(sftp=None, label_hour=None): + """Download roster CSVs from Instructure SFTP and post-process. + - If sftp provided, reuse it (already chdir'd to SIS). + - Files are saved using label_hour (format YYYY-MM-DD-HH). If None, compute fallback label by flooring to the hour. + """ + import pysftp + def log(msg): + ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + try: + with open('cache/pipeline.log.txt','a', encoding='utf-8') as f: + f.write(f"[{ts}] {msg}\n") + except Exception: + print(msg) + + close_after = False + if sftp is None: + cnopts = pysftp.CnOpts() + cnopts.hostkeys = None + sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts) + sftp.chdir('SIS') + close_after = True + try: + now = datetime.datetime.now() + if not label_hour: + # default fallback: floor to hour + label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H') + + files = set(sftp.listdir()) + log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}") + need = ['login','users','courses','enrollments'] + saved = [] + for name in need: + remote = f'{name}.csv' + target = f'cache/rosters/{name}-{label_hour}.csv' + try: + if remote in files: + if not (os.path.exists(target) and os.path.getsize(target) > 0): + temp = target + '.part' + try: + if os.path.exists(temp): + os.remove(temp) + except Exception: + pass + sftp.get(remote, temp) + # atomic replace + os.replace(temp, target) + saved.append(remote) + log(f' saved {remote} -> {target}') + else: + log(f' already exists: {target}, skipping') + else: + log(f' missing on server: {remote}') + except Exception as ex: + log(f' download failed: {remote} -> {target} err={ex}') + + if len(saved) >= 3 and 'courses.csv' in saved or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'): + try: + with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses: + courses.readline() + a = courses.readline() + parts = a.split(',') + year = parts[1][0:4] + ss = parts[1][4:6] + sem = {'30':'spring', '50':'summer', '70':'fall' } + this_sem = sem.get(ss, 'spring') + log(f"post-process for semester={this_sem} year={year} label={label_hour}") + convert_roster_files(this_sem,year,label_hour) + move_to_folder(this_sem,year,label_hour,saved) + except Exception as expp: + log(f'post-processing failed: {expp}') + else: + log('not enough files to post-process yet') + finally: + if close_after: + try: + sftp.close() + except Exception: + pass + +def fetch_current_rosters_auto(poll_seconds=15): + """Poll Instructure SFTP from the top of each hour until all 4 roster files appear, then download once. 
+ - Robust logging to cache/pipeline.log.txt + - Stops polling for the remainder of that hour after success + - Tries again on the next hour + """ + import pysftp + from contextlib import contextmanager + + log_path = 'cache/pipeline.log.txt' + + def log(msg): + ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + try: + with open(log_path, 'a', encoding='utf-8') as f: + f.write(f"[{ts}] {msg}\n") + except Exception: + print(f"[log-fail] {msg}") + + @contextmanager + def sftp_conn(): + cnopts = pysftp.CnOpts() + cnopts.hostkeys = None + conn = None + try: + conn = pysftp.Connection(instructure_url, username=instructure_username, + private_key=instructure_private_key, cnopts=cnopts) + conn.chdir('SIS') + yield conn + finally: + try: + if conn: + conn.close() + except Exception: + pass + + required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'} + current_window = None # e.g., '2025-08-30-14' + window_done = False + window_end = None + + def compute_window(now): + """Return (label_hour_str, window_end_dt) if within a window, else (None, None). + Window spans [H-5m, H+5m] around each top-of-hour H. + If minute >= 55 -> window is next hour + If minute <= 5 -> window is current hour + Else -> not in window. + """ + m = now.minute + if m >= 55: + base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)) + elif m <= 5: + base = now.replace(minute=0, second=0, microsecond=0) + else: + return (None, None) + label = base.strftime('%Y-%m-%d-%H') + end = base + timedelta(minutes=5) # end of window + return (label, end) + + log('fetch_current_rosters_auto: starting poll loop') + while True: + now = datetime.datetime.now() + hour_key = now.strftime('%Y-%m-%d %H') + label, enddt = compute_window(now) + if label is None: + time.sleep(max(5, int(poll_seconds))) + continue + + if label != current_window: + current_window = label + window_done = False + window_end = enddt + log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.') + + if window_done: + time.sleep(5) + continue + + # Poll SFTP for files + try: + with sftp_conn() as sftp: + files = set(sftp.listdir()) + missing = list(required - files) + if missing: + log(f'Not ready yet. Missing: {missing}') + else: + log('All required files present inside window. Starting download...') + try: + fetch_current_rosters(sftp=sftp, label_hour=current_window) + log('Download complete. Marking window_done for this window.') + window_done = True + except Exception as ex_dl: + import traceback + log('Download failed: ' + str(ex_dl)) + log(traceback.format_exc()) + except Exception as ex_list: + import traceback + log('SFTP list failed: ' + str(ex_list)) + log(traceback.format_exc()) + + # Check for window timeout + try: + if not window_done and window_end and now >= window_end: + log(f'REPORT: window {current_window} ended without all files. 
Consider alerting/email.') + window_done = True # stop further checks this window + except Exception: + pass + + time.sleep(max(5, int(poll_seconds))) + + + + + + +################ +################ SENDING DATA AWAY +################ +################ +################ + +# Upload a json file to www +def put_file(remotepath,localpath, localfile,prompt=1): + import pysftp + show_all = 0 + folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + cnopts = pysftp.CnOpts() + cnopts.hostkeys = None + + with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: + #todo: these paths + #files = sftp.listdir() + #print(folder + "\tI see these files on remote: ", files, "\n") + sftp.chdir(remotepath) + files = sftp.listdir() + if show_all: print(folder + "\tI see these files on remote: ", files, "\n") + localf = os.listdir(localpath) + if show_all: print("I see these local: ", localf) + if prompt: + input('ready to upload') + sftp.put(localpath+localfile, localfile, preserve_mtime=True) + sftp.close() + + +#text = + result = [] + last_type = '' + #answer_text = '' + answer = [] + in_a_list = '' + + # Get all the images + for k,value in doc_objects.items(): + tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") + fetched = fetch_doc_image(k,value) + + list_stack = [] + list_depth = 0 + last_list_depth = 0 + for value in doc_content: + tempout.write( json.dumps(value,indent=2) + "\n\n\n") + if verbose: print(json.dumps(value, sort_keys=True, indent=4)) + + tag_fxn = handle_para + if 'paragraph' in value: + this_text = '' + + # First we deal with if we're in a list. + if 'bullet' in value['paragraph']: + # either we're (1)starting a new list, (2)in one (do nothing), + # (3)starting a nested one, or (4)finished a nested one. + lid = value['paragraph']['bullet']['listId'] + if not list_stack: # 1 + list_stack.append(lid) + else: + if not lid == list_stack[0]: + if not lid in list_stack: # 3 + list_stack.append(lid) + else: # 4 + x = list_stack.pop() + while x != lid: list_stack.pop() + elif len(list_stack) > 0: + # current para isn't a bullet but we still have a list open. + list_stack = [] + + + list_depth = len(list_stack) + deeper = list_depth - last_list_depth + if deeper > 0: + answer.append("" * deeper) + if len(list_stack): + tag_fxn = handle_li + + # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next, + elements = value.get('paragraph').get('elements') + if 'paragraphStyle' in value.get('paragraph'): + style = value.get('paragraph').get('paragraphStyle') + if 'namedStyleType' in style: + type = style['namedStyleType'] + + # and FINALLY, the actual contents. + for elem in elements: + # text content + this_text += read_paragraph_element_2(elem,type) + + # image content + if 'inlineObjectElement' in elem: + vpi = elem['inlineObjectElement'] + if 'inlineObjectId' in vpi: + ii = vpi['inlineObjectId'] + if ii in img_lookup: + img = img_lookup[ii] + h = img_heights[ii] + w = img_widths[ii] + this_text += '' % (img,w,h) + + + # Now for something tricky. Call an appropriate handler, based on: + # (a) what is the paragraph style type? + # (b) is it different from the prev one? 
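+            # Illustrative summary of the dispatch below:
+            #   HEADING_2   -> handle_sec      (starts a new section)
+            #   HEADING_3   -> handle_question (new question; optional [anchor] in brackets)
+            #   NORMAL_TEXT -> accumulated into `answer` via handle_para / handle_li,
+            #                  or handle_tags / handle_icons when the text starts with
+            #                  "Tags:" / "Icons:".
+            # A transition out of NORMAL_TEXT flushes the accumulated answer first, so
+            # each heading ends up owning the body paragraphs that followed it.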
+ + if last_type=='NORMAL_TEXT' and type!=last_type: + if this_text.strip(): + result.append(handle_answer(answer)) + answer = [] + #answer_text = '' + + if type=='HEADING_2' and this_text.strip(): + result.append( handle_sec(this_text) ) + this_text = '' + elif type=='HEADING_3' and this_text.strip(): + result.append(handle_question(this_text,bracket)) + this_text = '' + else: + if this_text.lower().startswith('tags:'): + tag_fxn = handle_tags + if this_text.lower().startswith('icons:'): + tag_fxn = handle_icons + if this_text.strip(): + answer.append(tag_fxn(this_text)) + this_text = '' + last_type = type + last_list_depth = list_depth + + elif 'table' in value: + pass + + + result.append(handle_answer(answer)) + return json.dumps(result,indent=4) + + + + +def process_reg_history(term='fa25'): + from collections import defaultdict + from itertools import groupby + from operator import itemgetter + + def read_grouped_csv(path): + with open(path, newline='') as f: + fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] + reader = csv.DictReader(f, fieldnames=fieldnames) + rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp + grouped = {} + for ts, group in groupby(rows, key=itemgetter('datetime')): + grouped[ts] = {r['crn']: r for r in group} + return grouped + + def crossed_threshold(old_val, new_val, max_val): + thresholds = [0.25, 0.5, 0.75, 1.0] + if int(max_val) == 0: + return False, None + old_ratio = int(old_val) / int(max_val) + new_ratio = int(new_val) / int(max_val) + for t in thresholds: + if old_ratio < t <= new_ratio: + return True, int(t * 100) + return False, None + + def detect_changes(prev, curr): + changes = defaultdict(list) + + all_crns = prev.keys() | curr.keys() + for crn in all_crns: + o, n = prev.get(crn), curr.get(crn) + if not o: + changes[crn].append((n['datetime'], "Section was added.")) + elif not n: + changes[crn].append(( + o['datetime'], + f"Section was removed (last seen: teacher {o['teacher']}, " + f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)." 
+ )) + else: + dt = n['datetime'] + if o['teacher'] != n['teacher']: + changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}.")) + if o['enrolled'] != n['enrolled']: + crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) + if crossed: + changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).")) + if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']: + changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}.")) + return changes + + def time_to_iso(s): + return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat() + + def detect_changes_structured(prev, curr): + changes = defaultdict(list) + + all_crns = prev.keys() | curr.keys() + for crn in all_crns: + o, n = prev.get(crn), curr.get(crn) + if not o: + changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."}) + elif not n: + changes[crn].append( + {'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.", + 'value': o['enrolled'], 'capacity': o['max'], }) + else: + dt = time_to_iso(n['datetime']) + if o['teacher'] != n['teacher']: + changes[crn].append({'time':dt, "type":'teacher_change', + 'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.", + 'old_teacher': o['teacher'], 'new_teacher': n['teacher'], }) + if o['enrolled'] != n['enrolled']: + crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) + if crossed: + changes[crn].append({'time':dt, "type":'enrollment_milestone', + 'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).", + 'percent':percent,'value':n['enrolled'],'capacity':n['max'] }) + if int(n['waitlisted']) > 10 and o['waitlisted'] < n['waitlisted']: + changes[crn].append({'time':dt, "type":'enrollment_milestone', + 'message': f"Waitlist exceeds 10: {n['waitlisted']}).", + 'value':n['waitlisted']}) + return changes + + + def process_diff_timeline(path): + snapshots = read_grouped_csv(path) + timeline = sorted(snapshots.keys()) + timeline_diffs = [] + timeline_diffs_structured = [] + course_names = {} # crn -> latest known course name + + for i in range(1, len(timeline)): + prev_ts, curr_ts = timeline[i-1], timeline[i] + prev, curr = snapshots[prev_ts], snapshots[curr_ts] + + # update course name map + for crn, row in curr.items(): + course_names[crn] = row['course'] + + delta = detect_changes(prev, curr) + timeline_diffs.append(delta) + + delta_structured = detect_changes_structured(prev,curr) + timeline_diffs_structured.append(delta_structured) + + # Flatten and group by crn + crn_changes = defaultdict(list) + for delta in timeline_diffs: + for crn, changes in delta.items(): + crn_changes[crn].extend(changes) + + # Flatten and group by crn + crn_changes_structured = defaultdict(list) + for delta in timeline_diffs_structured: + for crn, changes in delta.items(): + crn_changes_structured[crn].extend(changes) + + # Sort changes for each CRN by datetime + for crn in crn_changes: + crn_changes[crn].sort(key=lambda x: x[0]) + + # Sort changes for each CRN by datetime + for crn in crn_changes_structured: + crn_changes[crn].sort(key=lambda x: x[0]) + + return crn_changes, crn_changes_structured, course_names + + fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text + fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8') + fresh_file.write(fresh_history) + fresh_file.close() + + output1 = 
codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8') + output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8') + changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv") + + # once for plain text + + for crn in sorted(changes, key=lambda c: course_names.get(c, "")): + course = course_names.get(crn, "") + course_output = {'code': course, 'crn':crn,'events':[]} + print(f"\n{course} (CRN {crn}):") + output1.write(f"\n{course} (CRN {crn}):\n") + for dt, msg in changes[crn]: + print(f" [{dt}] {msg}") + output1.write(f" [{dt}] {msg}\n") + + course_output['events'].append({'message':msg, 'time':time_to_iso(dt)}) + + # again for structured + crn_list = [] + + for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")): + course = course_names.get(crn, "") + course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]} + crn_list.append(course_output) + + output2.write( json.dumps(crn_list,indent=2) ) + output2.close() + + + +def recreate_all(): + # Use canonical semester short codes from semesters.py + try: + from semesters import code as SEM_CODES + except Exception: + SEM_CODES = [] + for x in SEM_CODES: + try: + recreate_reg_data(x) + except Exception as e: + print(f'Failed on {x} with: {e}') + + +def recreate_reg_data(term="fa25"): + from collections import defaultdict + from datetime import datetime + + def parse_row(row): + dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M") + crn = row['crn'] + enrolled = int(row['enrolled']) + return dt, row['datetime'], crn, enrolled + + def reduce_latest_per_day(rows): + latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled) + latest_ts_by_date = {} # date → (dt, ts) for header naming + + for row in rows: + dt, full_ts, crn, enrolled = parse_row(row) + date_str = dt.date().isoformat() + ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want + + # for each crn, per day, keep latest reading + if date_str not in latest[crn] or dt > latest[crn][date_str][0]: + latest[crn][date_str] = (dt, ts_header, enrolled) + + # also record latest timestamp per day for consistent column headers + if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]: + latest_ts_by_date[date_str] = (dt, ts_header) + + return latest, [ts for _, ts in sorted(latest_ts_by_date.values())] + + def pivot_table(latest, headers): + crns = sorted(latest) + table = [] + + for crn in crns: + row = [crn] + for ts in headers: + date_str = ts[:10] # match on YYYY-MM-DD + val = latest[crn].get(date_str) + if val and val[1] == ts: + row.append(str(val[2])) + else: + row.append("") + table.append(row) + + return ['crn'] + headers, table + + #with open(f"cache/reg_history_{term}.csv", newline='') as f: + from io import StringIO + url = f"https://gavilan.cc/schedule/reg_history_{term}.csv" + + # Download + resp = requests.get(url) + resp.raise_for_status() # raises if bad status + + # Wrap the text in a file-like object + f = StringIO(resp.text) + + fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] + reader = csv.DictReader(f, fieldnames=fieldnames) + rows = list(reader) + + latest, headers = reduce_latest_per_day(rows) + header_row, table = pivot_table(latest, headers) + + with open(f"cache/reg_data_{term}.csv", "w", newline='') as f: + writer = csv.writer(f) + writer.writerow(header_row) + writer.writerows(table) + + +if __name__ == "__main__": + + print ('') + options = { 1: ['Fetch rosters on 
schedule',fetch_current_rosters_auto] , + 2: ['Get canvas data 2024 style', canvas_data_2024_run ], + 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run], + 4: ['Narrative timeline of section updates', process_reg_history], + 5: ['Create narrative format all semesters', recreate_all], + 6: ['Recreate reg_data from full reg history', recreate_reg_data], + } + + '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] , + 2: ['Fetch rosters',fetch_current_rosters] , + 3: + 4: ['Compute how registration is filling up classes', schedule_filling] , + 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] , + 6: ['Canvas data: interactive sync', interactive ], + 7: ['Canvas data: automated sync', sync_non_interactive ], + 8: + 9: + 16: ['Scrape schedule from ssb', scrape_schedule_multi ], + 14: ['Generate latestart schedule', list_latestarts ], + 15: ['Test ssb calls with python', scrape_schedule_py ], + 10: ['schedule to db', scrape_for_db ], + 11: ['clean argos draft schedule file', argos_data_from_cvc], + 12: ['make expanded schedule json files of old semesters', expand_old_semesters ], + 13: ['Parse deanza schedule', dza_sched ], + ''' + + + if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): + resp = int(sys.argv[1]) + print("\n\nPerforming: %s\n\n" % options[resp][0]) + + else: + print ('') + for key in options: + print(str(key) + '.\t' + options[key][0]) + + print('') + resp = input('Choose: ') + + # Call the function in the options dict + options[ int(resp)][1]() + +# Testing + +#if __name__ == "__main__": + #users = fetch('/api/v1/courses/69/users?per_page=100',1) + #print "These are the users: " + #print users + + #getSemesterSchedule() + + + #get_doc() + #pass diff --git a/search.py b/search.py index 6c3c431..b9dec48 100644 --- a/search.py +++ b/search.py @@ -554,3 +554,26 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() +def try_clustering(df): + from sklearn.cluster import KMeans + df = df.drop(['code'], axis=1) + kmeans = KMeans(n_clusters=4, random_state=0).fit(df) + return kmeans.labels_ +def nlp_sample(): + from gensim import utils, corpora + from nltk import stem + stemmer = stem.porter.PorterStemmer() + strings = [ + "Human machine interface for lab abc computer applications", + "A survey of user opinion of computer system response time", + "The EPS user interface management system", + "System and human system engineering testing of EPS", + "Relation of user perceived response time to error measurement", + "The generation of random binary unordered trees", + "The intersection graph of paths in trees", + "Graph minors IV Widths of trees and well quasi ordering", + "Graph minors A survey", + ] + processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings] + dictionary = corpora.Dictionary(processed) + return dictionary diff --git a/users.py b/users.py index bba2e03..4ac8b82 100644 --- a/users.py +++ b/users.py @@ -1938,7 +1938,8 @@ def track_users_by_teacherclass(): print(json.dumps(g2, indent=2)) -def nlp_sample(): +## moved: nlp_sample now in search.py +# def nlp_sample(): # Stream a training corpus directly from S3. 
#corpus = corpora.MmCorpus("s3://path/to/corpus") @@ -1955,9 +1956,7 @@ def nlp_sample(): "Graph minors IV Widths of trees and well quasi ordering", "Graph minors A survey", ] - processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings] - print(processed) - dictionary = corpora.Dictionary( processed ) + # moved dct = dictionary print(dictionary) @@ -2980,4 +2979,3 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() -
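
The search.py hunk above leaves nlp_sample() returning the gensim Dictionary it builds. A minimal usage sketch, assuming gensim and nltk are installed and search.py is importable as shown in the hunk; the query sentence is illustrative only:

# Minimal usage sketch for the nlp_sample() helper added to search.py above.
from gensim import utils
from nltk import stem

from search import nlp_sample

# Build the toy dictionary from the hard-coded corpus inside nlp_sample().
dictionary = nlp_sample()

# Preprocess a new line of text the same way (simple_preprocess + Porter stem),
# then map it into a sparse bag-of-words vector of (token_id, count) pairs.
stemmer = stem.porter.PorterStemmer()
tokens = [stemmer.stem(t) for t in utils.simple_preprocess(
    "Measuring user response time in the EPS system", min_len=4)]
print(dictionary.doc2bow(tokens))

The same pattern would apply to try_clustering(df): it expects a numeric DataFrame with a 'code' column to drop, and returns one KMeans cluster label per row.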