diff --git a/content.py b/content.py
index 9332113..c8b92b4 100644
--- a/content.py
+++ b/content.py
@@ -1528,6 +1528,334 @@ LANE: HyFlex
+################
+################ GOOGLE DOCS HELPERS (moved from pipelines)
+################
+
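+# Small HTML emitters used by get_doc() below to assemble its output string.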
+def sec(t): return "<h2>"+t+"</h2>\n"
+def para(t): return "<p>"+t+"</p>\n"
+def ul(t): return "<ul>\n"
+def li(t): return "<li>"+t+"</li>\n"
+
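+# question() renders a heading as a linkable question: the anchor name comes from a
+# [bracketed] tag when one is present, otherwise from the first letter of each word.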
+def question(t,bracket=1):
+    ret = ''
+    match = re.search( r'\[(.*)\]', t)
+    if match and bracket:
+        ret += "<a name='" + match.group(1).lower() + "'></a>"
+        t = re.sub( r'\[.*\]','',t)
+    else:
+        parts = t.split(' ')
+        id = ''
+        for p in parts:
+            if re.search(r'[a-zA-Z]',p[0]): id += p[0]
+        ret += "<a name='%s'></a>" % id.lower()
+    return ret + '<h3>' + t + '</h3>\n<div>'
+
+def answer(t):
+    return t + '</div>\n'
+
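+# Return the text of one ParagraphElement from the Docs API response, wrapped in
+# link/bold markup when the textRun carries that styling.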
+def read_paragraph_element(element,type="NORMAL_TEXT"):
+ text_run = element.get('textRun')
+ begin = ''
+ end = ''
+ if not text_run:
+ return ''
+ if 'textStyle' in text_run and 'link' in text_run['textStyle']:
+        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
+        end = '</a>'
+ if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
+        begin = '<b>' + begin
+        end = end + '</b>'
+ content = text_run.get('content')
+    content = re.sub(u'\u000b','<br/>\n',content)
+ return begin + content + end
+
+def read_paragraph_element_2(element,type="NORMAL_TEXT"):
+ return read_paragraph_element(element,type)
+
+
+
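+# The handle_* functions below are structured counterparts of the HTML emitters above:
+# they return tuples describing the content rather than markup strings.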
+# t is a string that begins with "Icons: " and contains a comma-and-space separated list
+def handle_icons(t):
+ text = t[7:].strip()
+ parts = text.split(", ")
+ return ('icons',parts)
+
+# t is a string that begins with "Tags: " and contains a comma-and-space separated list
+def handle_tags(t):
+ text = t[6:].strip()
+ parts = text.split(", ")
+ return ('tags',parts)
+
+def handle_question(t,bracket=1):
+ anchor = ''
+ match = re.search( r'\[(.*)\]', t)
+ if match and bracket:
+ anchor = match.group(1).lower()
+ t = re.sub( r'\[.*\]','',t)
+ else:
+ parts = t.split(' ')
+ for p in parts:
+ if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
+ return ('question', t, anchor)
+
+def handle_answer(t):
+ return ('answer',t)
+
+def handle_sec(t): return ('section',t)
+def handle_para(t): return ('paragraph',t)
+def handle_ul(t): return ('unorderdedlist',t)
+def handle_li(t): return ('listitem',t)
+
+
+
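+# Module-level state for images pulled out of a document: filename and pixel size,
+# keyed by inlineObjectId.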
+img_count = 1
+img_lookup = {}
+img_heights = {}
+img_widths = {}
+
+
+'''def fetch_doc_image(k,value):
+ global img_count, img_lookup, img_heights, img_widths
+ if 'inlineObjectProperties' in value:
+ if 'embeddedObject' in value['inlineObjectProperties']:
+ if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
+ if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
+ print(k)
+ uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
+ response = requests.get(uu, stream=True)
+ name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
+ img_count += 1
+ img_lookup[k] = name
+
+ with open('cache/doc_images/'+name, 'wb') as out_file:
+ shutil.copyfileobj(response.raw, out_file)
+ print(uu)
+ print(response.headers)
+ print(name)
+ del response
+ if 'size' in value['inlineObjectProperties']['embeddedObject']:
+ img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
+ img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
+'''
+
+
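+# Simplified fetcher: saves one inline image to cache/doc_images/ but, unlike the
+# commented-out version above, does not record it in the img_* lookups.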
+def fetch_doc_image(k,value):
+ import shutil
+ if 'inlineObjectProperties' in value:
+ if 'embeddedObject' in value['inlineObjectProperties']:
+ if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
+ if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
+ uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
+ response = requests.get(uu, stream=True)
+ name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
+ with open('cache/doc_images/'+name, 'wb') as out_file:
+ shutil.copyfileobj(response.raw, out_file)
+ del response
+ return True
+
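+# Pull a Google Doc via the Docs API (OAuth token cached in token.pickle) and flatten
+# its headings, paragraphs, lists and inline images into one HTML string: HEADING_2
+# becomes a section, HEADING_3 a question, and everything else accumulates as answer text.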
+def get_doc(docid, bracket=1, verbose=0):
+ import pickle, shutil
+ import os.path
+ from googleapiclient.discovery import build
+ from google_auth_oauthlib.flow import InstalledAppFlow
+ from google.auth.transport.requests import Request
+
+ #ooout = open(fileout,'w')
+
+ # If modifying these scopes, delete the file token.pickle.
+ SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
+ creds = None
+ # The file token.pickle stores the user's access and refresh tokens, and is
+ # created automatically when the authorization flow completes for the first
+ # time.
+ if os.path.exists('token.pickle'):
+ with open('token.pickle', 'rb') as token:
+ creds = pickle.load(token)
+ # If there are no (valid) credentials available, let the user log in.
+ if not creds or not creds.valid:
+ if creds and creds.expired and creds.refresh_token:
+ creds.refresh(Request())
+ else:
+ flow = InstalledAppFlow.from_client_secrets_file(
+ 'credentials.json', SCOPES)
+ creds = flow.run_local_server(port=0)
+ # Save the credentials for the next run
+ with open('token.pickle', 'wb') as token:
+ pickle.dump(creds, token)
+
+ service = build('docs', 'v1', credentials=creds)
+
+ # Retrieve the documents contents from the Docs service.
+ document = service.documents().get(documentId=docid).execute()
+ if verbose: print(document)
+
+ tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
+ tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
+ if verbose: print('The title of the document is: {}'.format(document.get('title')))
+ doc_content = document.get('body').get('content')
+ if verbose: print(doc_content)
+
+ doc_objects = document.get('inlineObjects')
+ if verbose: print(doc_objects)
+
+ doc_lists = document.get('lists')
+
+ text = ''
+ last_type = ''
+ answer_text = ''
+ in_a_list = ''
+
+ img_count = 1
+ img_lookup = {}
+ img_heights = {}
+ img_widths = {}
+
+ if doc_objects:
+ for k,value in doc_objects.items():
+ tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
+ if 'inlineObjectProperties' in value:
+ if 'embeddedObject' in value['inlineObjectProperties']:
+ if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
+ if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
+ print(k)
+ uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
+ response = requests.get(uu, stream=True)
+ name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
+ img_count += 1
+
+ img_lookup[k] = name
+
+ with open('cache/doc_images/'+name, 'wb') as out_file:
+ shutil.copyfileobj(response.raw, out_file)
+ print(uu)
+ print(response.headers)
+ print(name)
+ #input('x?')
+ del response
+ if 'size' in value['inlineObjectProperties']['embeddedObject']:
+ img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
+ img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
+
+ tempout.write('- - - - - - - -\n\n')
+ #for value in doc_lists:
+ # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
+
+ tempout.write('- - - - - - - -\n\n')
+ list_stack = []
+ list_depth = 0
+ last_list_depth = 0
+ for value in doc_content:
+ tempout.write( json.dumps(value,indent=2) + "\n\n\n")
+ if verbose: print(json.dumps(value, sort_keys=True, indent=4))
+
+ # todo: x link, x bold, list, image.
+ tag_fxn = para
+ if 'paragraph' in value:
+ this_text = ''
+
+ if 'bullet' in value['paragraph']:
+ # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
+
+ lid = value['paragraph']['bullet']['listId']
+
+ if not list_stack: # 1
+ list_stack.append(lid)
+ else:
+ if lid == list_stack[0]: # 2
+ pass
+
+ else:
+ if not lid in list_stack: # 3
+ list_stack.append(lid)
+ else: # 4
+ x = list_stack.pop()
+ while x != lid: list_stack.pop()
+ elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
+ list_stack = []
+
+ list_depth = len(list_stack)
+
+ deeper = list_depth - last_list_depth
+
+ if deeper > 0:
+                        answer_text += "<ul>" * deeper
+ elif deeper < 0:
+ deeper = -1 * deeper
+                        answer_text += "</ul>" * deeper
+
+ if len(list_stack):
+ tag_fxn = li
+
+ elements = value.get('paragraph').get('elements')
+
+ # inlineObjectElement": {
+ # "inlineObjectId": "kix.ssseeu8j9cfx",
+
+ if 'paragraphStyle' in value.get('paragraph'):
+ style = value.get('paragraph').get('paragraphStyle')
+ #text += json.dumps(style, sort_keys=True, indent=4)
+ if 'namedStyleType' in style:
+ type = style['namedStyleType']
+
+ for elem in elements:
+
+ # text content
+ this_text += read_paragraph_element(elem,type)
+
+ # image content
+ if 'inlineObjectElement' in elem:
+ vpi = elem['inlineObjectElement']
+ if 'inlineObjectId' in vpi:
+ ii = vpi['inlineObjectId']
+ if ii in img_lookup:
+ img = img_lookup[ii]
+ h = img_heights[ii]
+ w = img_widths[ii]
+                                this_text += '<img src="%s" width="%d" height="%d">' % (img,w,h)
+
+
+
+ if last_type=='NORMAL_TEXT' and type!=last_type:
+ text += answer(answer_text)
+ answer_text = ''
+
+ if type=='HEADING_2':
+ text += sec(this_text)
+ this_text = ''
+ elif type=='HEADING_3':
+ text += question(this_text,bracket)
+ this_text = ''
+ else:
+ answer_text += tag_fxn(this_text)
+ this_text = ''
+ last_type = type
+ last_list_depth = list_depth
+
+ elif 'table' in value:
+            # The text in table cells is in nested Structural Elements, and tables may be nested.
+ text += "\nTABLE\n"
+ #table = value.get('table')
+ #for row in table.get('tableRows'):
+ # cells = row.get('tableCells')
+ # for cell in cells:
+ # text += read_strucutural_elements(cell.get('content'))
+ #elif 'tableOfContents' in value:
+ # # The text in the TOC is also in a Structural Element.
+ # toc = value.get('tableOfContents')
+ # text += read_strucutural_elements(toc.get('content'))
+
+ #else:
+ # print(json.dumps(value, sort_keys=True, indent=4))
+
+ text += answer(answer_text)
+    #text += '<br/>'
+ #print(text)
+ return text
+
+def get_doc_generic(docid, bracket=1, verbose=0):
+ return get_doc(docid, bracket, verbose)
@@ -1567,4 +1895,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()
-
diff --git a/courses.py b/courses.py
index 00a49e0..b33133f 100644
--- a/courses.py
+++ b/courses.py
@@ -2690,24 +2690,7 @@ def enrollment_helper():
# fill percentage for each section, then by mode, tod, campus
-def try_clustering(df):
- # Import required libraries
- from sklearn.cluster import KMeans
-
- # Preprocessing
-
- # Assuming df is your DataFrame and "modes" is your categorical column
- #df['code'] = df['code'].astype('category').cat.codes
-
- # Removing any other unnecessary columns
- df = df.drop(['code'], axis=1)
-
- # Perform KMeans clustering
- kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
-
- # Get the cluster labels
- labels = kmeans.labels_
-
+## moved: try_clustering now in search.py
# Add labels to the DataFrame
#df['clusters'] = labels
#print(df)
diff --git a/depricated.py b/depricated.py
index b34a5c2..1c2e604 100644
--- a/depricated.py
+++ b/depricated.py
@@ -4,6 +4,98 @@
# from pipelines - canvas data
+
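+# These helpers still reference local_data_folder, mylog, apiKey and apiSecret, which
+# this module needs to provide.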
+# Canvas data, download all new files
+def sync_non_interactive():
+ resp = do_request('/api/account/self/file/sync')
+ mylog.write(json.dumps(resp, indent=4))
+ #mylog.close()
+ gotten = os.listdir(local_data_folder)
+ wanted = []
+ i = 0
+ for x in resp['files']:
+ filename = x['filename']
+ exi = "No "
+ if filename in gotten: exi = "Yes"
+ else: wanted.append(x)
+
+ print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
+ i += 1
+ print("I will attempt to download %i files." % len(wanted))
+
+ #answer = input("Press enter to begin, or q to quit ")
+ #if not answer == '': return
+
+ good_count = 0
+ bad_count = 0
+ for W in wanted:
+ print("Downloading: " + W['filename'])
+ response = requests.request(method='GET', url=W['url'], stream=True)
+ if(response.status_code != 200):
+ print('Request response went bad. Got back a %s code, meaning the request was %s' % \
+ (response.status_code, response.reason))
+ print('URL: ' + W['url'])
+ bad_count += 1
+
+ else:
+ #Use the downloaded data
+ with open(local_data_folder + W['filename'], 'wb') as fd:
+ for chunk in response.iter_content(chunk_size=128):
+ fd.write(chunk)
+ print("Success")
+ good_count += 1
+ print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
+
+
+## OLD STYLE CANVAS DATA
+
+# Get something from Canvas Data
+def do_request(path):
+ #Set up the request pieces
+ method = 'GET'
+ host = 'api.inshosteddata.com'
+ apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
+ apiContentType = 'application/json'
+
+ msgList = []
+ msgList.append(method)
+ msgList.append(host)
+ msgList.append(apiContentType)
+ msgList.append('')
+ msgList.append(path)
+ msgList.append('')
+ msgList.append(apiTime)
+ msgList.append(apiSecret)
+
+ msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
+
+ sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
+ sig = sig.decode('utf-8')
+
+ headers = {}
+ headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
+ headers['Date'] = apiTime
+ headers['Content-type'] = apiContentType
+
+
+ #Submit the request/get a response
+ uri = "https://"+host+path
+ print (uri)
+ print (headers)
+ response = requests.request(method='GET', url=uri, headers=headers, stream=True)
+
+ #Check to make sure the request was ok
+ if(response.status_code != 200):
+        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
+ else:
+ #Use the downloaded data
+ jsonData = response.json()
+ #print(json.dumps(jsonData, indent=4))
+ return jsonData
+
+
+
+
def file_doesnt_exist(name):
# Get list of files in current directory
files = os.listdir()
diff --git a/localcache.py b/localcache.py
index 2668751..c1f1874 100644
--- a/localcache.py
+++ b/localcache.py
@@ -8,7 +8,7 @@ from datetime import datetime as dt
from datetime import timedelta
from dateutil.parser import parse
from os.path import exists, getmtime
-from pipelines import sync_non_interactive, url, header
+from pipelines import url, header
import util
from semesters import short_to_sis
@@ -1121,7 +1121,7 @@ def full_reload():
except Exception as e:
print("Couldn't rename file:", str(e))
- sync_non_interactive()
+ #sync_non_interactive()
setup_table('requests_sum1')
setup_table('courses')
diff --git a/pipelines.py b/pipelines.py
index 7e74f56..159ee41 100644
--- a/pipelines.py
+++ b/pipelines.py
@@ -1,1530 +1,1023 @@
-import util
-import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
-import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
-from datetime import timedelta
-
-from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
-from canvas_secrets import instructure_url, instructure_username, instructure_private_key
-
-import os, asyncio
-from dap.api import DAPClient
-from dap.dap_types import Credentials
-from dap.integration.database import DatabaseConnection
-from dap.replicator.sql import SQLReplicator
-
-
-
-"""
-Everything to do with fetching data,
- - From iLearn, via token
- - current roster uploads from instructures sftp site
- - raw logs and other from canvas data repo
- - from ssb, use firefox to scrape the schedule
-
-
-And some subsequent processing:
- - Raw roster files, into a more compact json format
- - Raw logs into something more useful
-"""
-
-verbose = False
-
-users = {}
-users_by_id = {}
-
-# todo: all these constants for SSB -- line 1008
-#
-# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
-
-
-
-
-
-
-sys.setrecursionlimit( 100000 )
-
-local_data_folder = 'cache/canvas_data/'
-mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
-
-
-
-
-class FetchError(Exception):
- pass
-
-
-DEBUG = 0
-
-def d(s,end=''):
- global DEBUG
- if end and DEBUG: print(s,end=end)
- elif DEBUG: print(s)
-
-################
-################ CANVAS API MAIN FETCHING FUNCTIONS
-################
-################
-################
-
-
-
-
-# Main canvas querying fxn
-def fetch(target,verbose=0,params=0,media=0):
- # if there are more results, recursivly call myself, adding on to the results.
- results = 0
- if target[0:4] != "http": target = url + target
- if verbose:
- print("++ Fetching: " + target)
- if media:
- r2 = requests.get(target, headers = header_media)
- elif params:
- r2 = requests.get(target, headers = header, params = params)
- else:
- r2 = requests.get(target, headers = header)
- #if verbose:
- #print "++ Got: " + r2.text
- try:
- results = json.loads(r2.text)
- count = len(results)
- except:
- print("-- Failed to parse: ", r2.text)
- if verbose:
- print("Got %i results" % count)
- if verbose > 1:
- print(r2.headers)
-
- tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
- tempout.write(r2.text+"\n\n")
- tempout.close()
-
- if ('link' in r2.headers and count > 0):
- links = r2.headers['link'].split(',')
- for L in links:
- ll = L.split(';')
- link = ll[0].replace("<","")
- link = link.replace(">","")
- if re.search(r'next', ll[1]):
- if (verbose): print("++ More link: " + link)
- #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
- #if (verbose): print("++ More link: " + link)
-
- nest = fetch(link,verbose,params,media)
- if isinstance(results,dict): results.update(nest)
- else: results.extend(nest)
- return results
-
-# Main canvas querying fxn - stream version - don't die on big requests
-def fetch_stream(target,verbose=0):
- # if there are more results, recursivly call myself, adding on to the results.
- results = 0
- while target:
- if target[0:4] != "http": target = url + target
- if verbose:
- print("++ Fetching: " + target)
- r2 = requests.get(target, headers = header)
- if r2.status_code == 502:
- raise FetchError()
- try:
- results = json.loads(r2.text)
- count = len(results)
- except:
- print("-- Failed to parse: ", r2.text)
- if verbose:
- print("Got %i results" % count)
- if verbose > 1:
- print(r2.headers)
- tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
- tempout.write(r2.text+"\n\n")
- tempout.close()
-
- next_link_found = 0
- if ('link' in r2.headers and count > 0):
- links = r2.headers['link'].split(',')
- for L in links:
- ll = L.split(';')
- link = ll[0].replace("<","")
- link = link.replace(">","")
- if re.search(r'next', ll[1]):
- target = link
- next_link_found = 1
- break
- if not next_link_found: target = 0
- yield results
-
-
-# for dicts with one key, collapse that one key out, cause
-# paging makes problems... example: enrollment_terms
-def fetch_collapse(target,collapse='',verbose=0):
- # if there are more results, recursivly call myself, adding on to the results.
- results = 0
- if target[0:4] != "http": target = url + target
- if verbose:
- print("++ Fetching: " + target)
- r2 = requests.get(target, headers = header)
- #if verbose:
- #print "++ Got: " + r2.text
- try:
- results = json.loads(r2.text)
- except:
- print("-- Failed to parse: ", r2.text)
- if verbose: print(r2.headers)
-
- if collapse and collapse in results:
- results = results[collapse]
-
- if ('link' in r2.headers):
- links = r2.headers['link'].split(',')
- for L in links:
- ll = L.split(';')
- link = ll[0].replace("<","")
- link = link.replace(">","")
- if re.search(r'next', ll[1]):
- if (verbose): print("++ More link: " + link)
- nest = fetch_collapse(link, collapse, verbose)
- if isinstance(results,dict): results.update(nest)
- else: results.extend(nest)
- return results
-
-
-
-
-
-
-
-
-
-################
-################ CANVAS DATA
-################
-################
-################
-
-
-# Get canvas data 2024 style
-def canvas_data_2024_run():
- print("Updating all tables.")
- asyncio.run(canvas_data_2024())
- print("Done with all tables.")
-
-
-async def canvas_data_2024():
-
- base_url: str = os.environ["DAP_API_URL"]
- client_id: str = os.environ["DAP_CLIENT_ID"]
- client_secret: str = os.environ["DAP_CLIENT_SECRET"]
- #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
-
- # todo: use secrets
- connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db"
-
- desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
- credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
-
- async with DatabaseConnection(connection_string).open() as db_connection:
- async with DAPClient(base_url, credentials) as session:
- #tables = await session.get_tables("canvas")
- for table in desired_tables:
- print(f" trying to update {table} ")
- try:
- #await SQLReplicator(session, db_connection).initialize("canvas", table)
- await SQLReplicator(session, db_connection).synchronize("canvas", table)
- except Exception as e:
- print(f" - skipping {table} because {e}")
-
-
-
-# Get canvas data 2024 style
-def setup_canvas_data_2024_run():
- print("Setting up all tables.")
- asyncio.run(setup_canvas_data_2024())
- print("Done with all tables.")
-
-
-async def setup_canvas_data_2024():
-
- base_url: str = os.environ["DAP_API_URL"]
- client_id: str = os.environ["DAP_CLIENT_ID"]
- client_secret: str = os.environ["DAP_CLIENT_SECRET"]
- #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
- connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db"
-
- desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
- credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
-
- async with DatabaseConnection(connection_string).open() as db_connection:
- async with DAPClient(base_url, credentials) as session:
- #tables = await session.get_tables("canvas")
- for table in desired_tables:
- print(f" {table}")
- try:
- await SQLReplicator(session, db_connection).initialize("canvas", table)
- except Exception as e:
- print(f" - skipping {table} because {e}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-################
-################ ROSTERS AND REGISTRATION
-################
-################
-################
-
-# todo: the pipeline is disorganized. Organize it to have
-# a hope of taking all this to a higher level.
-#
-
-# todo: where does this belong in the pipeline? compare with recent_schedules()
-
-
-
-# Take the generically named rosters uploads files and move them to a semester folder and give them a date.
-def move_to_folder(sem,year,folder,files):
- semester = year+sem
- semester_path = 'cache/rosters/%s' % semester
- if not os.path.isdir('cache/rosters/'+semester):
- os.makedirs('cache/rosters/'+semester)
- now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
- print("+ Moving roster files to folder: %s" % semester_path)
- if not os.path.isdir(semester_path):
- print("+ Creating folder: %s" % semester_path)
- os.makedirs(semester_path)
- def safe_move(src, dst):
- try:
- os.replace(src, dst)
- except Exception as ex:
- print(f"! move failed {src} -> {dst}: {ex}")
- if 'courses.csv' in files:
- safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
- if 'enrollments.csv' in files:
- safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
- if 'users.csv' in files:
- safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
-
-
-
-# Take raw upload (csv) files and make one big json out of them.
-# This relates to enrollment files, not schedule.
-def convert_roster_files(semester="",year="",folder=""):
- if not semester:
- semester = input("the semester? (ex: spring) ")
- folder = input("Folder? (ex 2020-02-25-14-58-20) ")
- uf = open('cache/rosters/users-'+folder+'.csv','r')
- cf = open('cache/rosters/courses-'+folder+'.csv','r')
- ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
- u = csv.DictReader(uf)
- c = csv.DictReader(cf)
- e = csv.DictReader(ef)
- uu = [i for i in u]
- cc = [i for i in c]
- ee = [i for i in e]
- uf.close()
- cf.close()
- ef.close()
- myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
-
- if os.path.exists(myrosterfile):
- print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
- last_fileobj = open(myrosterfile,'r')
- last_file = json.load(last_fileobj)
-
- last_fileobj.close()
-
- info = last_file[3]
- last_date = info['date_filestring']
-
- print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
-
- try:
- os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
- print(' -- ok')
- except Exception as e:
- print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
- print(e)
- myrosterfile = "new_" + myrosterfile
- pass
- #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
- #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
-
- newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
- try:
- new_roster = codecs.open(myrosterfile,'w', 'utf-8')
- new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
- new_roster.close()
- print(" -- Wrote roster info to: %s." % myrosterfile)
- except Exception as e:
- print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
- print(" ** " + str(e))
-
-
-
-
-
-
-# From instructure sftp site
-def fetch_current_rosters(sftp=None, label_hour=None):
- """Download roster CSVs from Instructure SFTP and post-process.
- - If sftp provided, reuse it (already chdir'd to SIS).
- - Files are saved using label_hour (format YYYY-MM-DD-HH). If None, compute fallback label by flooring to the hour.
- """
- import pysftp
- def log(msg):
- ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
- try:
- with open('cache/pipeline.log.txt','a', encoding='utf-8') as f:
- f.write(f"[{ts}] {msg}\n")
- except Exception:
- print(msg)
-
- close_after = False
- if sftp is None:
- cnopts = pysftp.CnOpts()
- cnopts.hostkeys = None
- sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts)
- sftp.chdir('SIS')
- close_after = True
- try:
- now = datetime.datetime.now()
- if not label_hour:
- # default fallback: floor to hour
- label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H')
-
- files = set(sftp.listdir())
- log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}")
- need = ['login','users','courses','enrollments']
- saved = []
- for name in need:
- remote = f'{name}.csv'
- target = f'cache/rosters/{name}-{label_hour}.csv'
- try:
- if remote in files:
- if not (os.path.exists(target) and os.path.getsize(target) > 0):
- temp = target + '.part'
- try:
- if os.path.exists(temp):
- os.remove(temp)
- except Exception:
- pass
- sftp.get(remote, temp)
- # atomic replace
- os.replace(temp, target)
- saved.append(remote)
- log(f' saved {remote} -> {target}')
- else:
- log(f' already exists: {target}, skipping')
- else:
- log(f' missing on server: {remote}')
- except Exception as ex:
- log(f' download failed: {remote} -> {target} err={ex}')
-
- if len(saved) >= 3 and 'courses.csv' in saved or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'):
- try:
- with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses:
- courses.readline()
- a = courses.readline()
- parts = a.split(',')
- year = parts[1][0:4]
- ss = parts[1][4:6]
- sem = {'30':'spring', '50':'summer', '70':'fall' }
- this_sem = sem.get(ss, 'spring')
- log(f"post-process for semester={this_sem} year={year} label={label_hour}")
- convert_roster_files(this_sem,year,label_hour)
- move_to_folder(this_sem,year,label_hour,saved)
- except Exception as expp:
- log(f'post-processing failed: {expp}')
- else:
- log('not enough files to post-process yet')
- finally:
- if close_after:
- try:
- sftp.close()
- except Exception:
- pass
-
-def fetch_current_rosters_auto(poll_seconds=15):
- """Poll Instructure SFTP from the top of each hour until all 4 roster files appear, then download once.
- - Robust logging to cache/pipeline.log.txt
- - Stops polling for the remainder of that hour after success
- - Tries again on the next hour
- """
- import pysftp
- from contextlib import contextmanager
-
- log_path = 'cache/pipeline.log.txt'
-
- def log(msg):
- ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
- try:
- with open(log_path, 'a', encoding='utf-8') as f:
- f.write(f"[{ts}] {msg}\n")
- except Exception:
- print(f"[log-fail] {msg}")
-
- @contextmanager
- def sftp_conn():
- cnopts = pysftp.CnOpts()
- cnopts.hostkeys = None
- conn = None
- try:
- conn = pysftp.Connection(instructure_url, username=instructure_username,
- private_key=instructure_private_key, cnopts=cnopts)
- conn.chdir('SIS')
- yield conn
- finally:
- try:
- if conn:
- conn.close()
- except Exception:
- pass
-
- required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'}
- current_window = None # e.g., '2025-08-30-14'
- window_done = False
- window_end = None
-
- def compute_window(now):
- """Return (label_hour_str, window_end_dt) if within a window, else (None, None).
- Window spans [H-5m, H+5m] around each top-of-hour H.
- If minute >= 55 -> window is next hour
- If minute <= 5 -> window is current hour
- Else -> not in window.
- """
- m = now.minute
- if m >= 55:
- base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1))
- elif m <= 5:
- base = now.replace(minute=0, second=0, microsecond=0)
- else:
- return (None, None)
- label = base.strftime('%Y-%m-%d-%H')
- end = base + timedelta(minutes=5) # end of window
- return (label, end)
-
- log('fetch_current_rosters_auto: starting poll loop')
- while True:
- now = datetime.datetime.now()
- hour_key = now.strftime('%Y-%m-%d %H')
- label, enddt = compute_window(now)
- if label is None:
- time.sleep(max(5, int(poll_seconds)))
- continue
-
- if label != current_window:
- current_window = label
- window_done = False
- window_end = enddt
- log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.')
-
- if window_done:
- time.sleep(5)
- continue
-
- # Poll SFTP for files
- try:
- with sftp_conn() as sftp:
- files = set(sftp.listdir())
- missing = list(required - files)
- if missing:
- log(f'Not ready yet. Missing: {missing}')
- else:
- log('All required files present inside window. Starting download...')
- try:
- fetch_current_rosters(sftp=sftp, label_hour=current_window)
- log('Download complete. Marking window_done for this window.')
- window_done = True
- except Exception as ex_dl:
- import traceback
- log('Download failed: ' + str(ex_dl))
- log(traceback.format_exc())
- except Exception as ex_list:
- import traceback
- log('SFTP list failed: ' + str(ex_list))
- log(traceback.format_exc())
-
- # Check for window timeout
- try:
- if not window_done and window_end and now >= window_end:
- log(f'REPORT: window {current_window} ended without all files. Consider alerting/email.')
- window_done = True # stop further checks this window
- except Exception:
- pass
-
- time.sleep(max(5, int(poll_seconds)))
-
-
-# Canvas data, download all new files
-def sync_non_interactive():
- resp = do_request('/api/account/self/file/sync')
- mylog.write(json.dumps(resp, indent=4))
- #mylog.close()
- gotten = os.listdir(local_data_folder)
- wanted = []
- i = 0
- for x in resp['files']:
- filename = x['filename']
- exi = "No "
- if filename in gotten: exi = "Yes"
- else: wanted.append(x)
-
- print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
- i += 1
- print("I will attempt to download %i files." % len(wanted))
-
- #answer = input("Press enter to begin, or q to quit ")
- #if not answer == '': return
-
- good_count = 0
- bad_count = 0
- for W in wanted:
- print("Downloading: " + W['filename'])
- response = requests.request(method='GET', url=W['url'], stream=True)
- if(response.status_code != 200):
- print('Request response went bad. Got back a %s code, meaning the request was %s' % \
- (response.status_code, response.reason))
- print('URL: ' + W['url'])
- bad_count += 1
-
- else:
- #Use the downloaded data
- with open(local_data_folder + W['filename'], 'wb') as fd:
- for chunk in response.iter_content(chunk_size=128):
- fd.write(chunk)
- print("Success")
- good_count += 1
- print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
-
-
-## OLD STYLE CANVAS DATA
-
-# Get something from Canvas Data
-def do_request(path):
- #Set up the request pieces
- method = 'GET'
- host = 'api.inshosteddata.com'
- apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
- apiContentType = 'application/json'
-
- msgList = []
- msgList.append(method)
- msgList.append(host)
- msgList.append(apiContentType)
- msgList.append('')
- msgList.append(path)
- msgList.append('')
- msgList.append(apiTime)
- msgList.append(apiSecret)
-
- msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
-
- sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
- sig = sig.decode('utf-8')
-
- headers = {}
- headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
- headers['Date'] = apiTime
- headers['Content-type'] = apiContentType
-
-
- #Submit the request/get a response
- uri = "https://"+host+path
- print (uri)
- print (headers)
- response = requests.request(method='GET', url=uri, headers=headers, stream=True)
-
- #Check to make sure the request was ok
- if(response.status_code != 200):
- print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
- else:
- #Use the downloaded data
- jsonData = response.json()
- #print(json.dumps(jsonData, indent=4))
- return jsonData
-
-
-
-
-
-
-################
-################ SENDING DATA AWAY
-################
-################
-################
-
-# Upload a json file to www
-def put_file(remotepath,localpath, localfile,prompt=1):
- import pysftp
- show_all = 0
- folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
- cnopts = pysftp.CnOpts()
- cnopts.hostkeys = None
-
- with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
- #todo: these paths
- #files = sftp.listdir()
- #print(folder + "\tI see these files on remote: ", files, "\n")
- sftp.chdir(remotepath)
- files = sftp.listdir()
- if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
- localf = os.listdir(localpath)
- if show_all: print("I see these local: ", localf)
- if prompt:
- input('ready to upload')
- sftp.put(localpath+localfile, localfile, preserve_mtime=True)
- sftp.close()
-
-
- """
- # copy files and directories from local static, to remote static,
- # preserving modification times on the files
- for f in localf:
- print("This local file: " + f + " ", end=' ')
- if not f in files:
- sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
- print("Uploaded.")
- else:
- print("Skipped.")
- """
-
- """if len(files)==3 and 'users.csv' in files:
- sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
- sftp.get('users.csv','rosters/users-'+folder+'.csv')
- sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
- print folder + '\tSaved three data files in rosters folder.'
-
- courses = open('rosters/courses-'+folder+'.csv','r')
- courses.readline()
- a = courses.readline()
- print a
- courses.close()
- parts = a.split(',')
- year = parts[1][0:4]
- ss = parts[1][4:6]
- #print parts[1]
- sem = {'30':'spring', '50':'summer', '70':'fall' }
- this_sem = sem[ss]
- #print this_sem, "", year
- print folder + '\tbuilding data file...'
- convert_roster_files(this_sem,year,folder)
- print folder + '\tmoving files...'
- move_to_folder(this_sem,year,folder)
- else:
- print folder + "\tDon't see all three files."""
-
-
-
-################
-################ GOOGLE DOCS
-################
-################
-################
-
-def sec(t): return "<h2>"+t+"</h2>\n"
-def para(t): return "<p>"+t+"</p>\n"
-def ul(t): return "<ul>\n"
-def li(t): return "<li>"+t+"</li>\n"
-
-def question(t,bracket=1):
-    ret = ''
-    match = re.search( r'\[(.*)\]', t)
-    if match and bracket:
-        ret += "<a name='" + match.group(1).lower() + "'></a>"
-        t = re.sub( r'\[.*\]','',t)
-    else:
-        parts = t.split(' ')
-        id = ''
-        for p in parts:
-            if re.search(r'[a-zA-Z]',p[0]): id += p[0]
-        ret += "<a name='%s'></a>" % id.lower()
-    return ret + '<h3>' + t + '</h3>\n<div>'
-
-def answer(t):
-    return t + '</div>\n'
-
-def read_paragraph_element(element,type="NORMAL_TEXT"):
- """Returns the text in the given ParagraphElement.
-
- Args:
- element: a ParagraphElement from a Google Doc.
- """
- text_run = element.get('textRun')
- begin = ''
- end = ''
- if not text_run:
- return ''
- if 'textStyle' in text_run and 'link' in text_run['textStyle']:
-        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
-        end = '</a>'
- if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
-        begin = '<b>' + begin
-        end = end + '</b>'
-
- content = text_run.get('content')
-    content = re.sub(u'\u000b','<br/>\n',content)
-
- return begin + content + end
-
-
-def get_doc(docid, bracket=1, verbose=0):
- import pickle
- import os.path
- from googleapiclient.discovery import build
- from google_auth_oauthlib.flow import InstalledAppFlow
- from google.auth.transport.requests import Request
-
- #ooout = open(fileout,'w')
-
- # If modifying these scopes, delete the file token.pickle.
- SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
- creds = None
- # The file token.pickle stores the user's access and refresh tokens, and is
- # created automatically when the authorization flow completes for the first
- # time.
- if os.path.exists('token.pickle'):
- with open('token.pickle', 'rb') as token:
- creds = pickle.load(token)
- # If there are no (valid) credentials available, let the user log in.
- if not creds or not creds.valid:
- if creds and creds.expired and creds.refresh_token:
- creds.refresh(Request())
- else:
- flow = InstalledAppFlow.from_client_secrets_file(
- 'credentials.json', SCOPES)
- creds = flow.run_local_server(port=0)
- # Save the credentials for the next run
- with open('token.pickle', 'wb') as token:
- pickle.dump(creds, token)
-
- service = build('docs', 'v1', credentials=creds)
-
- # Retrieve the documents contents from the Docs service.
- document = service.documents().get(documentId=docid).execute()
- if verbose: print(document)
-
- tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
- tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
- if verbose: print('The title of the document is: {}'.format(document.get('title')))
- doc_content = document.get('body').get('content')
- if verbose: print(doc_content)
-
- doc_objects = document.get('inlineObjects')
- if verbose: print(doc_objects)
-
- doc_lists = document.get('lists')
-
- text = ''
- last_type = ''
- answer_text = ''
- in_a_list = ''
-
- img_count = 1
- img_lookup = {}
- img_heights = {}
- img_widths = {}
-
- if doc_objects:
- for k,value in doc_objects.items():
- tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
- if 'inlineObjectProperties' in value:
- if 'embeddedObject' in value['inlineObjectProperties']:
- if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
- if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
- print(k)
- uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
- response = requests.get(uu, stream=True)
- name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
- img_count += 1
-
- img_lookup[k] = name
-
- with open('cache/doc_images/'+name, 'wb') as out_file:
- shutil.copyfileobj(response.raw, out_file)
- print(uu)
- print(response.headers)
- print(name)
- #input('x?')
- del response
- if 'size' in value['inlineObjectProperties']['embeddedObject']:
- img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
- img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
-
- tempout.write('- - - - - - - -\n\n')
- #for value in doc_lists:
- # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
-
- tempout.write('- - - - - - - -\n\n')
- list_stack = []
- list_depth = 0
- last_list_depth = 0
- for value in doc_content:
- tempout.write( json.dumps(value,indent=2) + "\n\n\n")
- if verbose: print(json.dumps(value, sort_keys=True, indent=4))
-
- # todo: x link, x bold, list, image.
- tag_fxn = para
- if 'paragraph' in value:
- this_text = ''
-
- if 'bullet' in value['paragraph']:
- # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
-
- lid = value['paragraph']['bullet']['listId']
-
- if not list_stack: # 1
- list_stack.append(lid)
- else:
- if lid == list_stack[0]: # 2
- pass
-
- else:
- if not lid in list_stack: # 3
- list_stack.append(lid)
- else: # 4
- x = list_stack.pop()
- while x != lid: list_stack.pop()
- elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
- list_stack = []
-
- list_depth = len(list_stack)
-
- deeper = list_depth - last_list_depth
-
- if deeper > 0:
-                answer_text += "<ul>" * deeper
- elif deeper < 0:
- deeper = -1 * deeper
-                answer_text += "</ul>" * deeper
-
- if len(list_stack):
- tag_fxn = li
-
- elements = value.get('paragraph').get('elements')
-
- # inlineObjectElement": {
- # "inlineObjectId": "kix.ssseeu8j9cfx",
-
- if 'paragraphStyle' in value.get('paragraph'):
- style = value.get('paragraph').get('paragraphStyle')
- #text += json.dumps(style, sort_keys=True, indent=4)
- if 'namedStyleType' in style:
- type = style['namedStyleType']
-
- for elem in elements:
-
- # text content
- this_text += read_paragraph_element(elem,type)
-
- # image content
- if 'inlineObjectElement' in elem:
- vpi = elem['inlineObjectElement']
- if 'inlineObjectId' in vpi:
- ii = vpi['inlineObjectId']
- if ii in img_lookup:
- img = img_lookup[ii]
- h = img_heights[ii]
- w = img_widths[ii]
-                            this_text += '<img src="%s" width="%d" height="%d">' % (img,w,h)
-
-
-
- if last_type=='NORMAL_TEXT' and type!=last_type:
- text += answer(answer_text)
- answer_text = ''
-
- if type=='HEADING_2':
- text += sec(this_text)
- this_text = ''
- elif type=='HEADING_3':
- text += question(this_text,bracket)
- this_text = ''
- else:
- answer_text += tag_fxn(this_text)
- this_text = ''
- last_type = type
- last_list_depth = list_depth
-
- elif 'table' in value:
- # The text in table cells are in nested Structural Elements and tables may be
- # nested.
- text += "\nTABLE\n"
- #table = value.get('table')
- #for row in table.get('tableRows'):
- # cells = row.get('tableCells')
- # for cell in cells:
- # text += read_strucutural_elements(cell.get('content'))
- #elif 'tableOfContents' in value:
- # # The text in the TOC is also in a Structural Element.
- # toc = value.get('tableOfContents')
- # text += read_strucutural_elements(toc.get('content'))
-
- #else:
- # print(json.dumps(value, sort_keys=True, indent=4))
-
- text += answer(answer_text)
-    #text += '<br/>'
- #print(text)
- return text
-
-######### TRY #2 ######
-
-
-def read_paragraph_element_2(element,type="NORMAL_TEXT"):
- text_run = element.get('textRun')
- begin = ''
- end = ''
- if not text_run: return ''
- if 'textStyle' in text_run and 'link' in text_run['textStyle']:
-        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
-        end = '</a>'
- if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
-        begin = '<b>' + begin
-        end = end + '</b>'
- elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
-        begin = '<i>' + begin
-        end = end + '</i>'
- content = text_run.get('content')
-    content = re.sub(u'\u000b','<br/>\n',content)
- return begin + content + end
-
-# t is a string that begins with "Icons: " ... and contains comma(space) separated list
-def handle_icons(t):
- text = t[7:].strip()
- parts = text.split(", ")
- return ('icons',parts)
-
-# t is a string that begins with "Tags: " ... and contains comma(space) separated list
-def handle_tags(t):
- text = t[6:].strip()
- parts = text.split(", ")
- return ('tags',parts)
-
-def handle_question(t,bracket=1):
- anchor = ''
- match = re.search( r'\[(.*)\]', t)
- if match and bracket:
- anchor = match.group(1).lower()
- t = re.sub( r'\[.*\]','',t)
- else:
- parts = t.split(' ')
- for p in parts:
- if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
- return ('question', t, anchor)
-
-def handle_answer(t):
- return ('answer',t)
-
-def handle_sec(t): return ('section',t)
-def handle_para(t): return ('paragraph',t)
-def handle_ul(t): return ('unorderdedlist',t)
-def handle_li(t): return ('listitem',t)
-
-
-
-img_count = 1
-img_lookup = {}
-img_heights = {}
-img_widths = {}
-
-
-def fetch_doc_image(k,value):
- global img_count, img_lookup, img_heights, img_widths
- if 'inlineObjectProperties' in value:
- if 'embeddedObject' in value['inlineObjectProperties']:
- if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
- if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
- print(k)
- uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
- response = requests.get(uu, stream=True)
- name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
- img_count += 1
- img_lookup[k] = name
-
- with open('cache/doc_images/'+name, 'wb') as out_file:
- shutil.copyfileobj(response.raw, out_file)
- print(uu)
- print(response.headers)
- print(name)
- del response
- if 'size' in value['inlineObjectProperties']['embeddedObject']:
- img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
- img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
-
-
-def get_doc_generic(docid, bracket=1, verbose=0):
- import pickle
- import os.path
- from googleapiclient.discovery import build
- from google_auth_oauthlib.flow import InstalledAppFlow
- from google.auth.transport.requests import Request
- global img_count, img_lookup, img_heights, img_widths
-
-# If modifying these scopes, delete the file token.pickle.
- SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
- creds = None
- # The file token.pickle stores the user's access and refresh tokens, and is
- # created automatically when the authorization flow completes for the first
- # time.
- if os.path.exists('token.pickle'):
- with open('token.pickle', 'rb') as token:
- creds = pickle.load(token)
- if not creds or not creds.valid:
- if creds and creds.expired and creds.refresh_token:
- creds.refresh(Request())
- else:
- flow = InstalledAppFlow.from_client_secrets_file(
- 'credentials.json', SCOPES)
- creds = flow.run_local_server(port=0)
- # Save the credentials for the next run
- with open('token.pickle', 'wb') as token:
- pickle.dump(creds, token)
-
- service = build('docs', 'v1', credentials=creds)
-
- # Retrieve the documents contents from the Docs service.
- document = service.documents().get(documentId=docid).execute()
-
- tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
- tempout.write( json.dumps(document,indent=2) \
- + "\n\n\n------------------------------------\n\n")
- if verbose: print('The title of the document is: {}'.format(document.get('title')))
-
- doc_content = document.get('body').get('content')
- doc_objects = document.get('inlineObjects')
- doc_lists = document.get('lists')
-
- #text = ''
- result = []
- last_type = ''
- #answer_text = ''
- answer = []
- in_a_list = ''
-
- # Get all the images
- for k,value in doc_objects.items():
- tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
- fetched = fetch_doc_image(k,value)
-
- list_stack = []
- list_depth = 0
- last_list_depth = 0
- for value in doc_content:
- tempout.write( json.dumps(value,indent=2) + "\n\n\n")
- if verbose: print(json.dumps(value, sort_keys=True, indent=4))
-
- tag_fxn = handle_para
- if 'paragraph' in value:
- this_text = ''
-
- # First we deal with if we're in a list.
- if 'bullet' in value['paragraph']:
- # either we're (1)starting a new list, (2)in one (do nothing),
- # (3)starting a nested one, or (4)finished a nested one.
- lid = value['paragraph']['bullet']['listId']
- if not list_stack: # 1
- list_stack.append(lid)
- else:
- if not lid == list_stack[0]:
- if not lid in list_stack: # 3
- list_stack.append(lid)
- else: # 4
- x = list_stack.pop()
- while x != lid: list_stack.pop()
- elif len(list_stack) > 0:
- # current para isn't a bullet but we still have a list open.
- list_stack = []
-
-
- list_depth = len(list_stack)
- deeper = list_depth - last_list_depth
- if deeper > 0:
-                answer.append("<ul>" * deeper)
- elif deeper < 0:
- deeper = -1 * deeper
-                answer.append("</ul>" * deeper)
- if len(list_stack):
- tag_fxn = handle_li
-
- # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
- elements = value.get('paragraph').get('elements')
- if 'paragraphStyle' in value.get('paragraph'):
- style = value.get('paragraph').get('paragraphStyle')
- if 'namedStyleType' in style:
- type = style['namedStyleType']
-
- # and FINALLY, the actual contents.
- for elem in elements:
- # text content
- this_text += read_paragraph_element_2(elem,type)
-
- # image content
- if 'inlineObjectElement' in elem:
- vpi = elem['inlineObjectElement']
- if 'inlineObjectId' in vpi:
- ii = vpi['inlineObjectId']
- if ii in img_lookup:
- img = img_lookup[ii]
- h = img_heights[ii]
- w = img_widths[ii]
-                        this_text += '<img src="%s" width="%d" height="%d">' % (img,w,h)
-
-
- # Now for something tricky. Call an appropriate handler, based on:
- # (a) what is the paragraph style type?
- # (b) is it different from the prev one?
-
- if last_type=='NORMAL_TEXT' and type!=last_type:
- if this_text.strip():
- result.append(handle_answer(answer))
- answer = []
- #answer_text = ''
-
- if type=='HEADING_2' and this_text.strip():
- result.append( handle_sec(this_text) )
- this_text = ''
- elif type=='HEADING_3' and this_text.strip():
- result.append(handle_question(this_text,bracket))
- this_text = ''
- else:
- if this_text.lower().startswith('tags:'):
- tag_fxn = handle_tags
- if this_text.lower().startswith('icons:'):
- tag_fxn = handle_icons
- if this_text.strip():
- answer.append(tag_fxn(this_text))
- this_text = ''
- last_type = type
- last_list_depth = list_depth
-
- elif 'table' in value:
- pass
-
-
- result.append(handle_answer(answer))
- return json.dumps(result,indent=4)
-
-
-
-
-def process_reg_history(term='fa25'):
- from collections import defaultdict
- from itertools import groupby
- from operator import itemgetter
-
- def read_grouped_csv(path):
- with open(path, newline='') as f:
- fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
- reader = csv.DictReader(f, fieldnames=fieldnames)
- rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp
- grouped = {}
- for ts, group in groupby(rows, key=itemgetter('datetime')):
- grouped[ts] = {r['crn']: r for r in group}
- return grouped
-
- def crossed_threshold(old_val, new_val, max_val):
- thresholds = [0.25, 0.5, 0.75, 1.0]
- if int(max_val) == 0:
- return False, None
- old_ratio = int(old_val) / int(max_val)
- new_ratio = int(new_val) / int(max_val)
- for t in thresholds:
- if old_ratio < t <= new_ratio:
- return True, int(t * 100)
- return False, None
-
- def detect_changes(prev, curr):
- changes = defaultdict(list)
-
- all_crns = prev.keys() | curr.keys()
- for crn in all_crns:
- o, n = prev.get(crn), curr.get(crn)
- if not o:
- changes[crn].append((n['datetime'], "Section was added."))
- elif not n:
- changes[crn].append((
- o['datetime'],
- f"Section was removed (last seen: teacher {o['teacher']}, "
- f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
- ))
- else:
- dt = n['datetime']
- if o['teacher'] != n['teacher']:
- changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
- if o['enrolled'] != n['enrolled']:
- crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
- if crossed:
- changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
- if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']:
- changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
- return changes
-
- def time_to_iso(s):
- return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat()
-
- def detect_changes_structured(prev, curr):
- changes = defaultdict(list)
-
- all_crns = prev.keys() | curr.keys()
- for crn in all_crns:
- o, n = prev.get(crn), curr.get(crn)
- if not o:
- changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."})
- elif not n:
- changes[crn].append(
- {'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.",
- 'value': o['enrolled'], 'capacity': o['max'], })
- else:
- dt = time_to_iso(n['datetime'])
- if o['teacher'] != n['teacher']:
- changes[crn].append({'time':dt, "type":'teacher_change',
- 'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.",
- 'old_teacher': o['teacher'], 'new_teacher': n['teacher'], })
- if o['enrolled'] != n['enrolled']:
- crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
- if crossed:
- changes[crn].append({'time':dt, "type":'enrollment_milestone',
- 'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).",
- 'percent':percent,'value':n['enrolled'],'capacity':n['max'] })
- if int(n['waitlisted']) > 10 and o['waitlisted'] < n['waitlisted']:
- changes[crn].append({'time':dt, "type":'enrollment_milestone',
- 'message': f"Waitlist exceeds 10: {n['waitlisted']}).",
- 'value':n['waitlisted']})
- return changes
-
-
- def process_diff_timeline(path):
- snapshots = read_grouped_csv(path)
- timeline = sorted(snapshots.keys())
- timeline_diffs = []
- timeline_diffs_structured = []
- course_names = {} # crn -> latest known course name
-
- for i in range(1, len(timeline)):
- prev_ts, curr_ts = timeline[i-1], timeline[i]
- prev, curr = snapshots[prev_ts], snapshots[curr_ts]
-
- # update course name map
- for crn, row in curr.items():
- course_names[crn] = row['course']
-
- delta = detect_changes(prev, curr)
- timeline_diffs.append(delta)
-
- delta_structured = detect_changes_structured(prev,curr)
- timeline_diffs_structured.append(delta_structured)
-
- # Flatten and group by crn
- crn_changes = defaultdict(list)
- for delta in timeline_diffs:
- for crn, changes in delta.items():
- crn_changes[crn].extend(changes)
-
- # Flatten and group by crn
- crn_changes_structured = defaultdict(list)
- for delta in timeline_diffs_structured:
- for crn, changes in delta.items():
- crn_changes_structured[crn].extend(changes)
-
- # Sort changes for each CRN by datetime
- for crn in crn_changes:
- crn_changes[crn].sort(key=lambda x: x[0])
-
- # Sort changes for each CRN by datetime
- for crn in crn_changes_structured:
- crn_changes[crn].sort(key=lambda x: x[0])
-
- return crn_changes, crn_changes_structured, course_names
-
- fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text
- fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8')
- fresh_file.write(fresh_history)
- fresh_file.close()
-
- output1 = codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8')
- output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8')
- changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv")
-
- # once for plain text
-
- for crn in sorted(changes, key=lambda c: course_names.get(c, "")):
- course = course_names.get(crn, "")
- course_output = {'code': course, 'crn':crn,'events':[]}
- print(f"\n{course} (CRN {crn}):")
- output1.write(f"\n{course} (CRN {crn}):\n")
- for dt, msg in changes[crn]:
- print(f" [{dt}] {msg}")
- output1.write(f" [{dt}] {msg}\n")
-
- course_output['events'].append({'message':msg, 'time':time_to_iso(dt)})
-
- # again for structured
- crn_list = []
-
- for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")):
- course = course_names.get(crn, "")
- course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]}
- crn_list.append(course_output)
-
- output2.write( json.dumps(crn_list,indent=2) )
- output2.close()
-
-
-
-def recreate_all():
- # Use canonical semester short codes from semesters.py
- try:
- from semesters import code as SEM_CODES
- except Exception:
- SEM_CODES = []
- for x in SEM_CODES:
- try:
- recreate_reg_data(x)
- except Exception as e:
- print(f'Failed on {x} with: {e}')
-
-
-def recreate_reg_data(term="fa25"):
- from collections import defaultdict
- from datetime import datetime
-
- def parse_row(row):
- dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
- crn = row['crn']
- enrolled = int(row['enrolled'])
- return dt, row['datetime'], crn, enrolled
-
- def reduce_latest_per_day(rows):
- latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled)
- latest_ts_by_date = {} # date → (dt, ts) for header naming
-
- for row in rows:
- dt, full_ts, crn, enrolled = parse_row(row)
- date_str = dt.date().isoformat()
- ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want
-
- # for each crn, per day, keep latest reading
- if date_str not in latest[crn] or dt > latest[crn][date_str][0]:
- latest[crn][date_str] = (dt, ts_header, enrolled)
-
- # also record latest timestamp per day for consistent column headers
- if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]:
- latest_ts_by_date[date_str] = (dt, ts_header)
-
- return latest, [ts for _, ts in sorted(latest_ts_by_date.values())]
-
- def pivot_table(latest, headers):
- crns = sorted(latest)
- table = []
-
- for crn in crns:
- row = [crn]
- for ts in headers:
- date_str = ts[:10] # match on YYYY-MM-DD
- val = latest[crn].get(date_str)
- if val and val[1] == ts:
- row.append(str(val[2]))
- else:
- row.append("")
- table.append(row)
-
- return ['crn'] + headers, table
-
- #with open(f"cache/reg_history_{term}.csv", newline='') as f:
- from io import StringIO
- url = f"https://gavilan.cc/schedule/reg_history_{term}.csv"
-
- # Download
- resp = requests.get(url)
- resp.raise_for_status() # raises if bad status
-
- # Wrap the text in a file-like object
- f = StringIO(resp.text)
-
- fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
- reader = csv.DictReader(f, fieldnames=fieldnames)
- rows = list(reader)
-
- latest, headers = reduce_latest_per_day(rows)
- header_row, table = pivot_table(latest, headers)
-
- with open(f"cache/reg_data_{term}.csv", "w", newline='') as f:
- writer = csv.writer(f)
- writer.writerow(header_row)
- writer.writerows(table)
-
-
-if __name__ == "__main__":
-
- print ('')
- options = { 1: ['Fetch rosters on schedule',fetch_current_rosters_auto] ,
- 2: ['Get canvas data 2024 style', canvas_data_2024_run ],
- 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
- 4: ['Narrative timeline of section updates', process_reg_history],
- 5: ['Create narrative format all semesters', recreate_all],
- 6: ['Recreate reg_data from full reg history', recreate_reg_data],
- }
-
- '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
- 2: ['Fetch rosters',fetch_current_rosters] ,
- 3:
- 4: ['Compute how registration is filling up classes', schedule_filling] ,
- 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
- 6: ['Canvas data: interactive sync', interactive ],
- 7: ['Canvas data: automated sync', sync_non_interactive ],
- 8:
- 9:
- 16: ['Scrape schedule from ssb', scrape_schedule_multi ],
- 14: ['Generate latestart schedule', list_latestarts ],
- 15: ['Test ssb calls with python', scrape_schedule_py ],
- 10: ['schedule to db', scrape_for_db ],
- 11: ['clean argos draft schedule file', argos_data_from_cvc],
- 12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
- 13: ['Parse deanza schedule', dza_sched ],
- '''
-
-
- if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
- resp = int(sys.argv[1])
- print("\n\nPerforming: %s\n\n" % options[resp][0])
-
- else:
- print ('')
- for key in options:
- print(str(key) + '.\t' + options[key][0])
-
- print('')
- resp = input('Choose: ')
-
- # Call the function in the options dict
- options[ int(resp)][1]()
-
-# Testing
-
-#if __name__ == "__main__":
- #users = fetch('/api/v1/courses/69/users?per_page=100',1)
- #print "These are the users: "
- #print users
-
- #getSemesterSchedule()
-
-
- #get_doc()
- #pass
+import util
+import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
+import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
+from datetime import timedelta
+
+from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media
+from canvas_secrets import instructure_url, instructure_username, instructure_private_key
+
+import os, asyncio
+from dap.api import DAPClient
+from dap.dap_types import Credentials
+from dap.integration.database import DatabaseConnection
+from dap.replicator.sql import SQLReplicator
+
+
+
+"""
+Everything to do with fetching data,
+ - From iLearn, via token
+ - current roster uploads from instructures sftp site
+ - raw logs and other from canvas data repo
+ - from ssb, use firefox to scrape the schedule
+
+
+And some subsequent processing:
+ - Raw roster files, into a more compact json format
+ - Raw logs into something more useful
+"""
+
+verbose = False
+
+users = {}
+users_by_id = {}
+
+# todo: all these constants for SSB -- line 1008
+#
+# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
+
+
+
+
+
+
+sys.setrecursionlimit( 100000 )
+
+local_data_folder = 'cache/canvas_data/'
+mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
+
+
+
+
+class FetchError(Exception):
+ pass
+
+
+DEBUG = 0
+
+def d(s,end=''):
+ global DEBUG
+ if end and DEBUG: print(s,end=end)
+ elif DEBUG: print(s)
+
+################
+################ CANVAS API MAIN FETCHING FUNCTIONS
+################
+################
+################
+
+
+
+
+# Main Canvas querying function.
+def fetch(target,verbose=0,params=0,media=0):
+    # If there are more results, recursively call myself and add the next page onto the results.
+    results = 0
+ if target[0:4] != "http": target = url + target
+ if verbose:
+ print("++ Fetching: " + target)
+ if media:
+ r2 = requests.get(target, headers = header_media)
+ elif params:
+ r2 = requests.get(target, headers = header, params = params)
+ else:
+ r2 = requests.get(target, headers = header)
+ #if verbose:
+ #print "++ Got: " + r2.text
+    count = 0
+    try:
+        results = json.loads(r2.text)
+        count = len(results)
+    except Exception:
+        print("-- Failed to parse: ", r2.text)
+    if verbose:
+        print("Got %i results" % count)
+ if verbose > 1:
+ print(r2.headers)
+
+ tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
+ tempout.write(r2.text+"\n\n")
+ tempout.close()
+
+ if ('link' in r2.headers and count > 0):
+ links = r2.headers['link'].split(',')
+ for L in links:
+ ll = L.split(';')
+ link = ll[0].replace("<","")
+ link = link.replace(">","")
+ if re.search(r'next', ll[1]):
+ if (verbose): print("++ More link: " + link)
+ #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
+ #if (verbose): print("++ More link: " + link)
+
+ nest = fetch(link,verbose,params,media)
+ if isinstance(results,dict): results.update(nest)
+ else: results.extend(nest)
+ return results
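+
+# Minimal usage sketch for fetch() (not wired into the menu): the course id below is a
+# placeholder. fetch() follows Canvas's paginated 'next' links and returns the combined
+# list (or merged dict) of results.
+def example_fetch_course_users(course_id=1234):
+    roster = fetch('/api/v1/courses/%s/users?per_page=100' % course_id, verbose=1)
+    for u in roster:
+        print(u['id'], u.get('name'))
+    return roster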
+
+# Main Canvas querying function - streaming version - doesn't die on big requests.
+def fetch_stream(target,verbose=0):
+    # Yield one page at a time, following the 'next' link until there are no more results.
+    results = 0
+ while target:
+ if target[0:4] != "http": target = url + target
+ if verbose:
+ print("++ Fetching: " + target)
+ r2 = requests.get(target, headers = header)
+ if r2.status_code == 502:
+ raise FetchError()
+        count = 0
+        try:
+            results = json.loads(r2.text)
+            count = len(results)
+        except Exception:
+            print("-- Failed to parse: ", r2.text)
+        if verbose:
+            print("Got %i results" % count)
+ if verbose > 1:
+ print(r2.headers)
+ tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
+ tempout.write(r2.text+"\n\n")
+ tempout.close()
+
+ next_link_found = 0
+ if ('link' in r2.headers and count > 0):
+ links = r2.headers['link'].split(',')
+ for L in links:
+ ll = L.split(';')
+ link = ll[0].replace("<","")
+ link = link.replace(">","")
+ if re.search(r'next', ll[1]):
+ target = link
+ next_link_found = 1
+ break
+ if not next_link_found: target = 0
+ yield results
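+
+# Minimal usage sketch for fetch_stream() (not wired into the menu): it yields one page at a
+# time, so big result sets can be processed without holding everything in memory. The endpoint
+# here is a placeholder built from the account_id imported from canvas_secrets.
+def example_stream_account_courses():
+    total = 0
+    for page in fetch_stream('/api/v1/accounts/%s/courses?per_page=100' % account_id, verbose=1):
+        total += len(page)
+    print("Saw %i courses" % total)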
+
+
+# For responses that wrap everything in a dict with one key, collapse that key out,
+# because paging makes problems otherwise... example: enrollment_terms
+def fetch_collapse(target,collapse='',verbose=0):
+    # If there are more results, recursively call myself and add the next page onto the results.
+    results = 0
+ if target[0:4] != "http": target = url + target
+ if verbose:
+ print("++ Fetching: " + target)
+ r2 = requests.get(target, headers = header)
+ #if verbose:
+ #print "++ Got: " + r2.text
+    try:
+        results = json.loads(r2.text)
+    except Exception:
+        print("-- Failed to parse: ", r2.text)
+ if verbose: print(r2.headers)
+
+ if collapse and collapse in results:
+ results = results[collapse]
+
+ if ('link' in r2.headers):
+ links = r2.headers['link'].split(',')
+ for L in links:
+ ll = L.split(';')
+ link = ll[0].replace("<","")
+ link = link.replace(">","")
+ if re.search(r'next', ll[1]):
+ if (verbose): print("++ More link: " + link)
+ nest = fetch_collapse(link, collapse, verbose)
+ if isinstance(results,dict): results.update(nest)
+ else: results.extend(nest)
+ return results
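+
+# Minimal usage sketch for fetch_collapse() (not wired into the menu): the terms endpoint
+# wraps its list in an 'enrollment_terms' key, which is exactly the case this helper exists for.
+def example_fetch_terms():
+    terms = fetch_collapse('/api/v1/accounts/%s/terms' % account_id, collapse='enrollment_terms', verbose=1)
+    for t in terms:
+        print(t['id'], t.get('name'))
+    return terms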
+
+
+
+
+
+
+
+
+
+################
+################ CANVAS DATA
+################
+################
+################
+
+
+# Get canvas data 2024 style
+def canvas_data_2024_run():
+ print("Updating all tables.")
+ asyncio.run(canvas_data_2024())
+ print("Done with all tables.")
+
+
+async def canvas_data_2024():
+
+ base_url: str = os.environ["DAP_API_URL"]
+ client_id: str = os.environ["DAP_CLIENT_ID"]
+ client_secret: str = os.environ["DAP_CLIENT_SECRET"]
+ #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
+
+ # todo: use secrets
+ connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db"
+
+ desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
+ credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
+
+ async with DatabaseConnection(connection_string).open() as db_connection:
+ async with DAPClient(base_url, credentials) as session:
+ #tables = await session.get_tables("canvas")
+ for table in desired_tables:
+ print(f" trying to update {table} ")
+ try:
+ #await SQLReplicator(session, db_connection).initialize("canvas", table)
+ await SQLReplicator(session, db_connection).synchronize("canvas", table)
+ except Exception as e:
+ print(f" - skipping {table} because {e}")
+
+
+
+# Set up canvas data 2024 style
+def setup_canvas_data_2024_run():
+ print("Setting up all tables.")
+ asyncio.run(setup_canvas_data_2024())
+ print("Done with all tables.")
+
+
+async def setup_canvas_data_2024():
+
+ base_url: str = os.environ["DAP_API_URL"]
+ client_id: str = os.environ["DAP_CLIENT_ID"]
+ client_secret: str = os.environ["DAP_CLIENT_SECRET"]
+ #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
+ connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db"
+
+ desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
+ credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
+
+ async with DatabaseConnection(connection_string).open() as db_connection:
+ async with DAPClient(base_url, credentials) as session:
+ #tables = await session.get_tables("canvas")
+ for table in desired_tables:
+ print(f" {table}")
+ try:
+ await SQLReplicator(session, db_connection).initialize("canvas", table)
+ except Exception as e:
+ print(f" - skipping {table} because {e}")
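+
+# A minimal sketch (not wired into the menu) of refreshing a single table, assuming the same
+# DAP_* environment variables used above and that setup_canvas_data_2024() has already run
+# initialize() for that table. The connection string is passed in rather than hard-coded.
+async def canvas_data_one_table(connection_string, table="enrollments"):
+    base_url = os.environ["DAP_API_URL"]
+    credentials = Credentials.create(client_id=os.environ["DAP_CLIENT_ID"],
+                                     client_secret=os.environ["DAP_CLIENT_SECRET"])
+    async with DatabaseConnection(connection_string).open() as db_connection:
+        async with DAPClient(base_url, credentials) as session:
+            await SQLReplicator(session, db_connection).synchronize("canvas", table)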
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+################
+################ ROSTERS AND REGISTRATION
+################
+################
+################
+
+# todo: the pipeline is disorganized. Organize it to have
+# a hope of taking all this to a higher level.
+#
+
+# todo: where does this belong in the pipeline? compare with recent_schedules()
+
+
+
+# Take the generically named roster upload files, move them to a semester folder, and date-stamp them.
+def move_to_folder(sem,year,folder,files):
+ semester = year+sem
+    semester_path = 'cache/rosters/%s' % semester
+    now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
+    print("+ Moving roster files to folder: %s" % semester_path)
+    if not os.path.isdir(semester_path):
+        print("+ Creating folder: %s" % semester_path)
+        os.makedirs(semester_path)
+ def safe_move(src, dst):
+ try:
+ os.replace(src, dst)
+ except Exception as ex:
+ print(f"! move failed {src} -> {dst}: {ex}")
+ if 'courses.csv' in files:
+ safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
+ if 'enrollments.csv' in files:
+ safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
+ if 'users.csv' in files:
+ safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
+
+
+
+# Take the raw upload (csv) files and make one big json out of them.
+# This relates to the enrollment files, not the schedule.
+def convert_roster_files(semester="",year="",folder=""):
+ if not semester:
+ semester = input("the semester? (ex: spring) ")
+ folder = input("Folder? (ex 2020-02-25-14-58-20) ")
+ uf = open('cache/rosters/users-'+folder+'.csv','r')
+ cf = open('cache/rosters/courses-'+folder+'.csv','r')
+ ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
+ u = csv.DictReader(uf)
+ c = csv.DictReader(cf)
+ e = csv.DictReader(ef)
+ uu = [i for i in u]
+ cc = [i for i in c]
+ ee = [i for i in e]
+ uf.close()
+ cf.close()
+ ef.close()
+ myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
+
+ if os.path.exists(myrosterfile):
+ print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
+ last_fileobj = open(myrosterfile,'r')
+ last_file = json.load(last_fileobj)
+
+ last_fileobj.close()
+
+ info = last_file[3]
+ last_date = info['date_filestring']
+
+ print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
+
+        try:
+            os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
+            print(' -- ok')
+        except Exception as e:
+            print(" ** Failed because I couldn't move the previous roster file: %s" % myrosterfile)
+            print(e)
+            # fall back to writing alongside the old file instead of clobbering it
+            myrosterfile = os.path.join(os.path.dirname(myrosterfile), 'new_' + os.path.basename(myrosterfile))
+ #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
+ #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
+
+ newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
+ try:
+ new_roster = codecs.open(myrosterfile,'w', 'utf-8')
+ new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
+ new_roster.close()
+ print(" -- Wrote roster info to: %s." % myrosterfile)
+ except Exception as e:
+        print(" ** Failed to write the new roster file: %s" % myrosterfile)
+ print(" ** " + str(e))
+
+
+
+
+
+
+# From the Instructure SFTP site
+def fetch_current_rosters(sftp=None, label_hour=None):
+    """Download roster CSVs from the Instructure SFTP site and post-process them.
+    - If an sftp connection is provided, reuse it (already chdir'd to SIS).
+    - Files are saved using label_hour (format YYYY-MM-DD-HH); if None, a fallback label is computed by flooring to the hour.
+    """
+ import pysftp
+ def log(msg):
+ ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ try:
+ with open('cache/pipeline.log.txt','a', encoding='utf-8') as f:
+ f.write(f"[{ts}] {msg}\n")
+ except Exception:
+ print(msg)
+
+ close_after = False
+ if sftp is None:
+ cnopts = pysftp.CnOpts()
+ cnopts.hostkeys = None
+ sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts)
+ sftp.chdir('SIS')
+ close_after = True
+ try:
+ now = datetime.datetime.now()
+ if not label_hour:
+ # default fallback: floor to hour
+ label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H')
+
+ files = set(sftp.listdir())
+ log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}")
+ need = ['login','users','courses','enrollments']
+ saved = []
+ for name in need:
+ remote = f'{name}.csv'
+ target = f'cache/rosters/{name}-{label_hour}.csv'
+ try:
+ if remote in files:
+ if not (os.path.exists(target) and os.path.getsize(target) > 0):
+ temp = target + '.part'
+ try:
+ if os.path.exists(temp):
+ os.remove(temp)
+ except Exception:
+ pass
+ sftp.get(remote, temp)
+ # atomic replace
+ os.replace(temp, target)
+ saved.append(remote)
+ log(f' saved {remote} -> {target}')
+ else:
+ log(f' already exists: {target}, skipping')
+ else:
+ log(f' missing on server: {remote}')
+ except Exception as ex:
+ log(f' download failed: {remote} -> {target} err={ex}')
+
+        if (len(saved) >= 3 and 'courses.csv' in saved) or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'):
+ try:
+ with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses:
+ courses.readline()
+ a = courses.readline()
+ parts = a.split(',')
+ year = parts[1][0:4]
+ ss = parts[1][4:6]
+ sem = {'30':'spring', '50':'summer', '70':'fall' }
+ this_sem = sem.get(ss, 'spring')
+ log(f"post-process for semester={this_sem} year={year} label={label_hour}")
+ convert_roster_files(this_sem,year,label_hour)
+ move_to_folder(this_sem,year,label_hour,saved)
+ except Exception as expp:
+ log(f'post-processing failed: {expp}')
+ else:
+ log('not enough files to post-process yet')
+ finally:
+ if close_after:
+ try:
+ sftp.close()
+ except Exception:
+ pass
+
+def fetch_current_rosters_auto(poll_seconds=15):
+    """Poll the Instructure SFTP site in a short window around the top of each hour until all 4 roster files appear, then download them once.
+ - Robust logging to cache/pipeline.log.txt
+ - Stops polling for the remainder of that hour after success
+ - Tries again on the next hour
+ """
+ import pysftp
+ from contextlib import contextmanager
+
+ log_path = 'cache/pipeline.log.txt'
+
+ def log(msg):
+ ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ try:
+ with open(log_path, 'a', encoding='utf-8') as f:
+ f.write(f"[{ts}] {msg}\n")
+ except Exception:
+ print(f"[log-fail] {msg}")
+
+ @contextmanager
+ def sftp_conn():
+ cnopts = pysftp.CnOpts()
+ cnopts.hostkeys = None
+ conn = None
+ try:
+ conn = pysftp.Connection(instructure_url, username=instructure_username,
+ private_key=instructure_private_key, cnopts=cnopts)
+ conn.chdir('SIS')
+ yield conn
+ finally:
+ try:
+ if conn:
+ conn.close()
+ except Exception:
+ pass
+
+ required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'}
+ current_window = None # e.g., '2025-08-30-14'
+ window_done = False
+ window_end = None
+
+ def compute_window(now):
+ """Return (label_hour_str, window_end_dt) if within a window, else (None, None).
+ Window spans [H-5m, H+5m] around each top-of-hour H.
+ If minute >= 55 -> window is next hour
+ If minute <= 5 -> window is current hour
+ Else -> not in window.
+ """
+ m = now.minute
+ if m >= 55:
+ base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1))
+ elif m <= 5:
+ base = now.replace(minute=0, second=0, microsecond=0)
+ else:
+ return (None, None)
+ label = base.strftime('%Y-%m-%d-%H')
+ end = base + timedelta(minutes=5) # end of window
+ return (label, end)
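+
+    # A few example calls (times are hypothetical), to show what compute_window returns:
+    #   13:57 -> ('2025-08-30-14', ends 14:05)   minute >= 55, so roll forward to the next hour
+    #   14:03 -> ('2025-08-30-14', ends 14:05)   minute <= 5, so use the current hour
+    #   14:30 -> (None, None)                    outside any window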
+
+ log('fetch_current_rosters_auto: starting poll loop')
+ while True:
+ now = datetime.datetime.now()
+ label, enddt = compute_window(now)
+ if label is None:
+ time.sleep(max(5, int(poll_seconds)))
+ continue
+
+ if label != current_window:
+ current_window = label
+ window_done = False
+ window_end = enddt
+ log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.')
+
+ if window_done:
+ time.sleep(5)
+ continue
+
+ # Poll SFTP for files
+ try:
+ with sftp_conn() as sftp:
+ files = set(sftp.listdir())
+ missing = list(required - files)
+ if missing:
+ log(f'Not ready yet. Missing: {missing}')
+ else:
+ log('All required files present inside window. Starting download...')
+ try:
+ fetch_current_rosters(sftp=sftp, label_hour=current_window)
+ log('Download complete. Marking window_done for this window.')
+ window_done = True
+ except Exception as ex_dl:
+ import traceback
+ log('Download failed: ' + str(ex_dl))
+ log(traceback.format_exc())
+ except Exception as ex_list:
+ import traceback
+ log('SFTP list failed: ' + str(ex_list))
+ log(traceback.format_exc())
+
+ # Check for window timeout
+ try:
+ if not window_done and window_end and now >= window_end:
+ log(f'REPORT: window {current_window} ended without all files. Consider alerting/email.')
+ window_done = True # stop further checks this window
+ except Exception:
+ pass
+
+ time.sleep(max(5, int(poll_seconds)))
+
+
+
+
+
+
+################
+################ SENDING DATA AWAY
+################
+################
+################
+
+# Upload a json file to www
+def put_file(remotepath,localpath, localfile,prompt=1):
+ import pysftp
+ show_all = 0
+ folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
+ cnopts = pysftp.CnOpts()
+ cnopts.hostkeys = None
+
+ with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
+ #todo: these paths
+ #files = sftp.listdir()
+ #print(folder + "\tI see these files on remote: ", files, "\n")
+ sftp.chdir(remotepath)
+ files = sftp.listdir()
+ if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
+ localf = os.listdir(localpath)
+ if show_all: print("I see these local: ", localf)
+ if prompt:
+ input('ready to upload')
+ sftp.put(localpath+localfile, localfile, preserve_mtime=True)
+ sftp.close()
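+
+# Minimal usage sketch for put_file() (not wired into the menu): the remote folder name
+# here is an assumption, not necessarily the real path on the web host.
+def example_upload_reg_data(term='fa25'):
+    put_file('schedule', 'cache/', 'reg_data_%s.csv' % term, prompt=0)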
+
+
+# Walk a fetched Google Doc body and build structured (section/question/answer) items.
+# Note: this block arrived without its enclosing def; the signature below is inferred
+# from the names it uses (doc_objects, doc_content, tempout, bracket) and is an assumption.
+def parse_google_doc(doc_objects, doc_content, tempout, bracket=1):
+    result = []
+    last_type = ''
+    type = 'NORMAL_TEXT'   # default paragraph style until one is seen
+    #answer_text = ''
+    answer = []
+    in_a_list = ''
+
+ # Get all the images
+ for k,value in doc_objects.items():
+ tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
+ fetched = fetch_doc_image(k,value)
+
+ list_stack = []
+ list_depth = 0
+ last_list_depth = 0
+ for value in doc_content:
+ tempout.write( json.dumps(value,indent=2) + "\n\n\n")
+ if verbose: print(json.dumps(value, sort_keys=True, indent=4))
+
+ tag_fxn = handle_para
+ if 'paragraph' in value:
+ this_text = ''
+
+ # First we deal with if we're in a list.
+ if 'bullet' in value['paragraph']:
+ # either we're (1)starting a new list, (2)in one (do nothing),
+ # (3)starting a nested one, or (4)finished a nested one.
+ lid = value['paragraph']['bullet']['listId']
+ if not list_stack: # 1
+ list_stack.append(lid)
+ else:
+ if not lid == list_stack[0]:
+ if not lid in list_stack: # 3
+ list_stack.append(lid)
+ else: # 4
+ x = list_stack.pop()
+ while x != lid: list_stack.pop()
+ elif len(list_stack) > 0:
+ # current para isn't a bullet but we still have a list open.
+ list_stack = []
+
+
+ list_depth = len(list_stack)
+ deeper = list_depth - last_list_depth
+            if deeper > 0:
+                answer.append("<ul>\n" * deeper)
+            elif deeper < 0:
+                deeper = -1 * deeper
+                answer.append("</ul>\n" * deeper)
+ if len(list_stack):
+ tag_fxn = handle_li
+
+ # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
+ elements = value.get('paragraph').get('elements')
+ if 'paragraphStyle' in value.get('paragraph'):
+ style = value.get('paragraph').get('paragraphStyle')
+ if 'namedStyleType' in style:
+ type = style['namedStyleType']
+
+ # and FINALLY, the actual contents.
+ for elem in elements:
+ # text content
+ this_text += read_paragraph_element_2(elem,type)
+
+ # image content
+ if 'inlineObjectElement' in elem:
+ vpi = elem['inlineObjectElement']
+ if 'inlineObjectId' in vpi:
+ ii = vpi['inlineObjectId']
+ if ii in img_lookup:
+ img = img_lookup[ii]
+ h = img_heights[ii]
+ w = img_widths[ii]
+                            # embed the cached image with its recorded dimensions
+                            this_text += '<img src="%s" width="%s" height="%s">' % (img,w,h)
+
+
+ # Now for something tricky. Call an appropriate handler, based on:
+ # (a) what is the paragraph style type?
+ # (b) is it different from the prev one?
+
+ if last_type=='NORMAL_TEXT' and type!=last_type:
+ if this_text.strip():
+ result.append(handle_answer(answer))
+ answer = []
+ #answer_text = ''
+
+ if type=='HEADING_2' and this_text.strip():
+ result.append( handle_sec(this_text) )
+ this_text = ''
+ elif type=='HEADING_3' and this_text.strip():
+ result.append(handle_question(this_text,bracket))
+ this_text = ''
+ else:
+ if this_text.lower().startswith('tags:'):
+ tag_fxn = handle_tags
+ if this_text.lower().startswith('icons:'):
+ tag_fxn = handle_icons
+ if this_text.strip():
+ answer.append(tag_fxn(this_text))
+ this_text = ''
+ last_type = type
+ last_list_depth = list_depth
+
+ elif 'table' in value:
+ pass
+
+
+ result.append(handle_answer(answer))
+ return json.dumps(result,indent=4)
+
+
+
+
+def process_reg_history(term='fa25'):
+ from collections import defaultdict
+ from itertools import groupby
+ from operator import itemgetter
+
+ def read_grouped_csv(path):
+ with open(path, newline='') as f:
+ fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
+ reader = csv.DictReader(f, fieldnames=fieldnames)
+ rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp
+ grouped = {}
+ for ts, group in groupby(rows, key=itemgetter('datetime')):
+ grouped[ts] = {r['crn']: r for r in group}
+ return grouped
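+
+    # read_grouped_csv returns, roughly (timestamps and CRNs here are made up):
+    #   { '2025-08-01T07-00': { '40123': {row dict}, '40124': {row dict}, ... },
+    #     '2025-08-02T07-00': { ... }, ... }
+    # i.e. one snapshot per timestamp, keyed by CRN.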
+
+ def crossed_threshold(old_val, new_val, max_val):
+ thresholds = [0.25, 0.5, 0.75, 1.0]
+ if int(max_val) == 0:
+ return False, None
+ old_ratio = int(old_val) / int(max_val)
+ new_ratio = int(new_val) / int(max_val)
+ for t in thresholds:
+ if old_ratio < t <= new_ratio:
+ return True, int(t * 100)
+ return False, None
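+
+    # For example (made-up numbers): crossed_threshold(11, 13, 50) -> (True, 25)
+    # because 11/50 = 22% and 13/50 = 26% straddle the 25% mark, while
+    # crossed_threshold(13, 14, 50) -> (False, None).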
+
+ def detect_changes(prev, curr):
+ changes = defaultdict(list)
+
+ all_crns = prev.keys() | curr.keys()
+ for crn in all_crns:
+ o, n = prev.get(crn), curr.get(crn)
+ if not o:
+ changes[crn].append((n['datetime'], "Section was added."))
+ elif not n:
+ changes[crn].append((
+ o['datetime'],
+ f"Section was removed (last seen: teacher {o['teacher']}, "
+ f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
+ ))
+ else:
+ dt = n['datetime']
+ if o['teacher'] != n['teacher']:
+ changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
+ if o['enrolled'] != n['enrolled']:
+ crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
+ if crossed:
+ changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
+ if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']:
+ changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
+ return changes
+
+ def time_to_iso(s):
+ return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat()
+
+ def detect_changes_structured(prev, curr):
+ changes = defaultdict(list)
+
+ all_crns = prev.keys() | curr.keys()
+ for crn in all_crns:
+ o, n = prev.get(crn), curr.get(crn)
+ if not o:
+ changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."})
+ elif not n:
+ changes[crn].append(
+ {'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.",
+ 'value': o['enrolled'], 'capacity': o['max'], })
+ else:
+ dt = time_to_iso(n['datetime'])
+ if o['teacher'] != n['teacher']:
+ changes[crn].append({'time':dt, "type":'teacher_change',
+ 'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.",
+ 'old_teacher': o['teacher'], 'new_teacher': n['teacher'], })
+ if o['enrolled'] != n['enrolled']:
+ crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
+ if crossed:
+ changes[crn].append({'time':dt, "type":'enrollment_milestone',
+ 'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).",
+ 'percent':percent,'value':n['enrolled'],'capacity':n['max'] })
+ if int(n['waitlisted']) > 10 and o['waitlisted'] < n['waitlisted']:
+ changes[crn].append({'time':dt, "type":'enrollment_milestone',
+                        'message': f"Waitlist exceeds 10: {n['waitlisted']}.",
+ 'value':n['waitlisted']})
+ return changes
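+
+    # Each structured change is a small dict; for example (made-up values):
+    #   {'time': '2025-08-02T07:00:00', 'type': 'enrollment_milestone',
+    #    'message': 'Enrollment crossed 75% (30/40).',
+    #    'percent': 75, 'value': '30', 'capacity': '40'}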
+
+
+ def process_diff_timeline(path):
+ snapshots = read_grouped_csv(path)
+ timeline = sorted(snapshots.keys())
+ timeline_diffs = []
+ timeline_diffs_structured = []
+ course_names = {} # crn -> latest known course name
+
+ for i in range(1, len(timeline)):
+ prev_ts, curr_ts = timeline[i-1], timeline[i]
+ prev, curr = snapshots[prev_ts], snapshots[curr_ts]
+
+ # update course name map
+ for crn, row in curr.items():
+ course_names[crn] = row['course']
+
+ delta = detect_changes(prev, curr)
+ timeline_diffs.append(delta)
+
+ delta_structured = detect_changes_structured(prev,curr)
+ timeline_diffs_structured.append(delta_structured)
+
+ # Flatten and group by crn
+ crn_changes = defaultdict(list)
+ for delta in timeline_diffs:
+ for crn, changes in delta.items():
+ crn_changes[crn].extend(changes)
+
+ # Flatten and group by crn
+ crn_changes_structured = defaultdict(list)
+ for delta in timeline_diffs_structured:
+ for crn, changes in delta.items():
+ crn_changes_structured[crn].extend(changes)
+
+ # Sort changes for each CRN by datetime
+ for crn in crn_changes:
+ crn_changes[crn].sort(key=lambda x: x[0])
+
+        # Sort structured changes for each CRN by time
+        for crn in crn_changes_structured:
+            crn_changes_structured[crn].sort(key=lambda x: x['time'])
+
+ return crn_changes, crn_changes_structured, course_names
+
+    fresh_history = requests.get(f"https://gavilan.cc/schedule/reg_history_{term}.csv").text
+ fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8')
+ fresh_file.write(fresh_history)
+ fresh_file.close()
+
+ output1 = codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8')
+ output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8')
+ changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv")
+
+ # once for plain text
+
+ for crn in sorted(changes, key=lambda c: course_names.get(c, "")):
+ course = course_names.get(crn, "")
+ course_output = {'code': course, 'crn':crn,'events':[]}
+ print(f"\n{course} (CRN {crn}):")
+ output1.write(f"\n{course} (CRN {crn}):\n")
+ for dt, msg in changes[crn]:
+ print(f" [{dt}] {msg}")
+ output1.write(f" [{dt}] {msg}\n")
+
+ course_output['events'].append({'message':msg, 'time':time_to_iso(dt)})
+
+ # again for structured
+ crn_list = []
+
+ for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")):
+ course = course_names.get(crn, "")
+ course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]}
+ crn_list.append(course_output)
+
+ output2.write( json.dumps(crn_list,indent=2) )
+ output2.close()
+
+
+
+def recreate_all():
+ # Use canonical semester short codes from semesters.py
+ try:
+ from semesters import code as SEM_CODES
+ except Exception:
+ SEM_CODES = []
+ for x in SEM_CODES:
+ try:
+ recreate_reg_data(x)
+ except Exception as e:
+ print(f'Failed on {x} with: {e}')
+
+
+def recreate_reg_data(term="fa25"):
+ from collections import defaultdict
+ from datetime import datetime
+
+ def parse_row(row):
+ dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
+ crn = row['crn']
+ enrolled = int(row['enrolled'])
+ return dt, row['datetime'], crn, enrolled
+
+ def reduce_latest_per_day(rows):
+ latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled)
+ latest_ts_by_date = {} # date → (dt, ts) for header naming
+
+ for row in rows:
+ dt, full_ts, crn, enrolled = parse_row(row)
+ date_str = dt.date().isoformat()
+ ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want
+
+ # for each crn, per day, keep latest reading
+ if date_str not in latest[crn] or dt > latest[crn][date_str][0]:
+ latest[crn][date_str] = (dt, ts_header, enrolled)
+
+ # also record latest timestamp per day for consistent column headers
+ if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]:
+ latest_ts_by_date[date_str] = (dt, ts_header)
+
+ return latest, [ts for _, ts in sorted(latest_ts_by_date.values())]
+
+ def pivot_table(latest, headers):
+ crns = sorted(latest)
+ table = []
+
+ for crn in crns:
+ row = [crn]
+ for ts in headers:
+ date_str = ts[:10] # match on YYYY-MM-DD
+ val = latest[crn].get(date_str)
+ if val and val[1] == ts:
+ row.append(str(val[2]))
+ else:
+ row.append("")
+ table.append(row)
+
+ return ['crn'] + headers, table
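+
+    # The pivoted CSV ends up shaped like (made-up values):
+    #   crn,2025-08-01T07,2025-08-02T07,...
+    #   40123,25,27,...
+    #   40124,,14,...
+    # with a blank cell when a CRN's latest reading that day wasn't at the day's latest timestamp.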
+
+ #with open(f"cache/reg_history_{term}.csv", newline='') as f:
+ from io import StringIO
+ url = f"https://gavilan.cc/schedule/reg_history_{term}.csv"
+
+ # Download
+ resp = requests.get(url)
+ resp.raise_for_status() # raises if bad status
+
+ # Wrap the text in a file-like object
+ f = StringIO(resp.text)
+
+ fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
+ reader = csv.DictReader(f, fieldnames=fieldnames)
+ rows = list(reader)
+
+ latest, headers = reduce_latest_per_day(rows)
+ header_row, table = pivot_table(latest, headers)
+
+ with open(f"cache/reg_data_{term}.csv", "w", newline='') as f:
+ writer = csv.writer(f)
+ writer.writerow(header_row)
+ writer.writerows(table)
+
+
+if __name__ == "__main__":
+
+ print ('')
+ options = { 1: ['Fetch rosters on schedule',fetch_current_rosters_auto] ,
+ 2: ['Get canvas data 2024 style', canvas_data_2024_run ],
+ 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
+ 4: ['Narrative timeline of section updates', process_reg_history],
+ 5: ['Create narrative format all semesters', recreate_all],
+ 6: ['Recreate reg_data from full reg history', recreate_reg_data],
+ }
+
+ '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
+ 2: ['Fetch rosters',fetch_current_rosters] ,
+ 3:
+ 4: ['Compute how registration is filling up classes', schedule_filling] ,
+ 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
+ 6: ['Canvas data: interactive sync', interactive ],
+ 7: ['Canvas data: automated sync', sync_non_interactive ],
+ 8:
+ 9:
+ 16: ['Scrape schedule from ssb', scrape_schedule_multi ],
+ 14: ['Generate latestart schedule', list_latestarts ],
+ 15: ['Test ssb calls with python', scrape_schedule_py ],
+ 10: ['schedule to db', scrape_for_db ],
+ 11: ['clean argos draft schedule file', argos_data_from_cvc],
+ 12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
+ 13: ['Parse deanza schedule', dza_sched ],
+ '''
+
+
+ if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
+ resp = int(sys.argv[1])
+ print("\n\nPerforming: %s\n\n" % options[resp][0])
+
+ else:
+ print ('')
+ for key in options:
+ print(str(key) + '.\t' + options[key][0])
+
+ print('')
+ resp = input('Choose: ')
+
+ # Call the function in the options dict
+ options[ int(resp)][1]()
+
+# Testing
+
+#if __name__ == "__main__":
+ #users = fetch('/api/v1/courses/69/users?per_page=100',1)
+ #print "These are the users: "
+ #print users
+
+ #getSemesterSchedule()
+
+
+ #get_doc()
+ #pass
diff --git a/search.py b/search.py
index 6c3c431..b9dec48 100644
--- a/search.py
+++ b/search.py
@@ -554,3 +554,26 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()
+def try_clustering(df):
+ from sklearn.cluster import KMeans
+ df = df.drop(['code'], axis=1)
+ kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
+ return kmeans.labels_
+def nlp_sample():
+ from gensim import utils, corpora
+ from nltk import stem
+ stemmer = stem.porter.PorterStemmer()
+ strings = [
+ "Human machine interface for lab abc computer applications",
+ "A survey of user opinion of computer system response time",
+ "The EPS user interface management system",
+ "System and human system engineering testing of EPS",
+ "Relation of user perceived response time to error measurement",
+ "The generation of random binary unordered trees",
+ "The intersection graph of paths in trees",
+ "Graph minors IV Widths of trees and well quasi ordering",
+ "Graph minors A survey",
+ ]
+ processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
+ dictionary = corpora.Dictionary(processed)
+ return dictionary
diff --git a/users.py b/users.py
index bba2e03..4ac8b82 100644
--- a/users.py
+++ b/users.py
@@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
print(json.dumps(g2, indent=2))
-def nlp_sample():
+## moved: nlp_sample now in search.py
+# def nlp_sample():
# Stream a training corpus directly from S3.
#corpus = corpora.MmCorpus("s3://path/to/corpus")
@@ -1955,9 +1956,7 @@ def nlp_sample():
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey",
]
- processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
- print(processed)
- dictionary = corpora.Dictionary( processed )
+ # moved
dct = dictionary
print(dictionary)
@@ -2980,4 +2979,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()
-