import util
import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
from datetime import timedelta
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key
import os, asyncio
from dap.api import DAPClient
from dap.dap_types import Credentials
from dap.integration.database import DatabaseConnection
from dap.replicator.sql import SQLReplicator
"""
Everything to do with fetching data,
- From iLearn, via token
- current roster uploads from instructures sftp site
- raw logs and other from canvas data repo
- from ssb, use firefox to scrape the schedule
And some subsequent processing:
- Raw roster files, into a more compact json format
- Raw logs into something more useful
"""
verbose = False
users = {}
users_by_id = {}
# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
sys.setrecursionlimit( 100000 )
local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
class FetchError(Exception):
pass
DEBUG = 0
def d(s,end=''):
global DEBUG
if end and DEBUG: print(s,end=end)
elif DEBUG: print(s)
################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################
# Main Canvas querying function.
def fetch(target,verbose=0,params=0,media=0):
    # If there are more results, recursively call myself, appending them to the results.
results = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
if media:
r2 = requests.get(target, headers = header_media)
elif params:
r2 = requests.get(target, headers = header, params = params)
else:
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
    count = 0
    try:
        results = json.loads(r2.text)
        count = len(results)
    except:
        print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
#link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
#if (verbose): print("++ More link: " + link)
nest = fetch(link,verbose,params,media)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
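# Hedged usage sketch: page through every user in a course with fetch(). The
# course id and per_page value are illustrative only; fetch() follows the
# Link: rel="next" header and returns all pages concatenated.
def example_fetch_course_users(course_id=69):
    return fetch('/api/v1/courses/%s/users?per_page=100' % course_id, verbose=1)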
# Main Canvas querying function, streaming version: yields one page at a time so big requests don't die.
def fetch_stream(target,verbose=0):
    # Follow the 'next' links in a loop, yielding each page of results as it arrives.
results = 0
while target:
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
if r2.status_code == 502:
raise FetchError()
        count = 0
        try:
            results = json.loads(r2.text)
            count = len(results)
        except:
            print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
next_link_found = 0
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
target = link
next_link_found = 1
break
if not next_link_found: target = 0
yield results
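# Hedged usage sketch: consume fetch_stream() one page at a time so a large
# result set never has to sit in memory as a single list. The enrollments
# endpoint and course id are illustrative only.
def example_stream_enrollments(course_id=69):
    total = 0
    for page in fetch_stream('/api/v1/courses/%s/enrollments?per_page=100' % course_id, verbose=1):
        total += len(page)
    return total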
# For endpoints that wrap their results in a single key (example: enrollment_terms),
# collapse that key out, because paging otherwise makes merging pages awkward.
def fetch_collapse(target,collapse='',verbose=0):
    # If there are more results, recursively call myself, appending them to the results.
results = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
except:
print("-- Failed to parse: ", r2.text)
if verbose: print(r2.headers)
if collapse and collapse in results:
results = results[collapse]
if ('link' in r2.headers):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
nest = fetch_collapse(link, collapse, verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
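# Hedged usage sketch: the terms endpoint wraps its list in a single
# "enrollment_terms" key, which fetch_collapse() flattens across pages.
# account_id comes from canvas_secrets, imported at the top of this file.
def example_fetch_terms():
    return fetch_collapse('/api/v1/accounts/%s/terms' % account_id, collapse='enrollment_terms', verbose=1)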
################
################ CANVAS DATA
################
################
################
# Get canvas data 2024 style
def canvas_data_2024_run():
print("Updating all tables.")
asyncio.run(canvas_data_2024())
print("Done with all tables.")
async def canvas_data_2024():
base_url: str = os.environ["DAP_API_URL"]
client_id: str = os.environ["DAP_CLIENT_ID"]
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
# todo: use secrets
connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db"
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
async with DatabaseConnection(connection_string).open() as db_connection:
async with DAPClient(base_url, credentials) as session:
#tables = await session.get_tables("canvas")
for table in desired_tables:
print(f" trying to update {table} ")
try:
#await SQLReplicator(session, db_connection).initialize("canvas", table)
await SQLReplicator(session, db_connection).synchronize("canvas", table)
except Exception as e:
print(f" - skipping {table} because {e}")
# Set up Canvas Data 2024 style (initialize the tables rather than synchronize them).
def setup_canvas_data_2024_run():
print("Setting up all tables.")
asyncio.run(setup_canvas_data_2024())
print("Done with all tables.")
async def setup_canvas_data_2024():
base_url: str = os.environ["DAP_API_URL"]
client_id: str = os.environ["DAP_CLIENT_ID"]
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db"
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
async with DatabaseConnection(connection_string).open() as db_connection:
async with DAPClient(base_url, credentials) as session:
#tables = await session.get_tables("canvas")
for table in desired_tables:
print(f" {table}")
try:
await SQLReplicator(session, db_connection).initialize("canvas", table)
except Exception as e:
print(f" - skipping {table} because {e}")
################
################ ROSTERS AND REGISTRATION
################
################
################
# todo: the pipeline is disorganized. Organize it to have
# a hope of taking all this to a higher level.
#
# todo: where does this belong in the pipeline? Compare with recent_schedules().
# Take the generically named roster upload files, move them into a semester folder, and date-stamp them.
def move_to_folder(sem,year,folder,files):
semester = year+sem
semester_path = 'cache/rosters/%s' % semester
if not os.path.isdir('cache/rosters/'+semester):
os.makedirs('cache/rosters/'+semester)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
print("+ Moving roster files to folder: %s" % semester_path)
if not os.path.isdir(semester_path):
print("+ Creating folder: %s" % semester_path)
os.makedirs(semester_path)
def safe_move(src, dst):
try:
os.replace(src, dst)
except Exception as ex:
print(f"! move failed {src} -> {dst}: {ex}")
if 'courses.csv' in files:
safe_move('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
if 'enrollments.csv' in files:
safe_move('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
if 'users.csv' in files:
safe_move('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
# Take raw upload (csv) files and make one big json out of them.
# This relates to enrollment files, not schedule.
def convert_roster_files(semester="",year="",folder=""):
if not semester:
semester = input("the semester? (ex: spring) ")
folder = input("Folder? (ex 2020-02-25-14-58-20) ")
uf = open('cache/rosters/users-'+folder+'.csv','r')
cf = open('cache/rosters/courses-'+folder+'.csv','r')
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
u = csv.DictReader(uf)
c = csv.DictReader(cf)
e = csv.DictReader(ef)
uu = [i for i in u]
cc = [i for i in c]
ee = [i for i in e]
uf.close()
cf.close()
ef.close()
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
if os.path.exists(myrosterfile):
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
last_fileobj = open(myrosterfile,'r')
last_file = json.load(last_fileobj)
last_fileobj.close()
info = last_file[3]
last_date = info['date_filestring']
print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
try:
            os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
print(' -- ok')
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(e)
myrosterfile = "new_" + myrosterfile
pass
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
try:
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
new_roster.close()
print(" -- Wrote roster info to: %s." % myrosterfile)
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(" ** " + str(e))
# From instructure sftp site
def fetch_current_rosters(sftp=None, label_hour=None):
"""Download roster CSVs from Instructure SFTP and post-process.
- If sftp provided, reuse it (already chdir'd to SIS).
- Files are saved using label_hour (format YYYY-MM-DD-HH). If None, compute fallback label by flooring to the hour.
"""
import pysftp
def log(msg):
ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
try:
with open('cache/pipeline.log.txt','a', encoding='utf-8') as f:
f.write(f"[{ts}] {msg}\n")
except Exception:
print(msg)
close_after = False
if sftp is None:
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
sftp = pysftp.Connection(instructure_url, username=instructure_username, private_key=instructure_private_key, cnopts=cnopts)
sftp.chdir('SIS')
close_after = True
try:
now = datetime.datetime.now()
if not label_hour:
# default fallback: floor to hour
label_hour = now.replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d-%H')
files = set(sftp.listdir())
log(f"fetch_current_rosters: label={label_hour} files={sorted(files)}")
need = ['login','users','courses','enrollments']
saved = []
for name in need:
remote = f'{name}.csv'
target = f'cache/rosters/{name}-{label_hour}.csv'
try:
if remote in files:
if not (os.path.exists(target) and os.path.getsize(target) > 0):
temp = target + '.part'
try:
if os.path.exists(temp):
os.remove(temp)
except Exception:
pass
sftp.get(remote, temp)
# atomic replace
os.replace(temp, target)
saved.append(remote)
log(f' saved {remote} -> {target}')
else:
log(f' already exists: {target}, skipping')
else:
log(f' missing on server: {remote}')
except Exception as ex:
log(f' download failed: {remote} -> {target} err={ex}')
        if (len(saved) >= 3 and 'courses.csv' in saved) or os.path.exists(f'cache/rosters/courses-{label_hour}.csv'):
try:
with open(f'cache/rosters/courses-{label_hour}.csv','r') as courses:
courses.readline()
a = courses.readline()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem.get(ss, 'spring')
log(f"post-process for semester={this_sem} year={year} label={label_hour}")
convert_roster_files(this_sem,year,label_hour)
move_to_folder(this_sem,year,label_hour,saved)
except Exception as expp:
log(f'post-processing failed: {expp}')
else:
log('not enough files to post-process yet')
finally:
if close_after:
try:
sftp.close()
except Exception:
pass
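# Hedged usage note: a one-off manual pull reuses the same code path, e.g.
#   fetch_current_rosters(label_hour='2025-08-30-14')
# or pass label_hour=None to fall back to the current hour's label.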
def fetch_current_rosters_auto(poll_seconds=15):
"""Poll Instructure SFTP from the top of each hour until all 4 roster files appear, then download once.
- Robust logging to cache/pipeline.log.txt
- Stops polling for the remainder of that hour after success
- Tries again on the next hour
"""
import pysftp
from contextlib import contextmanager
log_path = 'cache/pipeline.log.txt'
def log(msg):
ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
try:
with open(log_path, 'a', encoding='utf-8') as f:
f.write(f"[{ts}] {msg}\n")
except Exception:
print(f"[log-fail] {msg}")
@contextmanager
def sftp_conn():
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
conn = None
try:
conn = pysftp.Connection(instructure_url, username=instructure_username,
private_key=instructure_private_key, cnopts=cnopts)
conn.chdir('SIS')
yield conn
finally:
try:
if conn:
conn.close()
except Exception:
pass
required = {'login.csv', 'users.csv', 'courses.csv', 'enrollments.csv'}
current_window = None # e.g., '2025-08-30-14'
window_done = False
window_end = None
def compute_window(now):
"""Return (label_hour_str, window_end_dt) if within a window, else (None, None).
Window spans [H-5m, H+5m] around each top-of-hour H.
If minute >= 55 -> window is next hour
If minute <= 5 -> window is current hour
Else -> not in window.
"""
m = now.minute
if m >= 55:
base = (now.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1))
elif m <= 5:
base = now.replace(minute=0, second=0, microsecond=0)
else:
return (None, None)
label = base.strftime('%Y-%m-%d-%H')
end = base + timedelta(minutes=5) # end of window
return (label, end)
log('fetch_current_rosters_auto: starting poll loop')
while True:
now = datetime.datetime.now()
hour_key = now.strftime('%Y-%m-%d %H')
label, enddt = compute_window(now)
if label is None:
time.sleep(max(5, int(poll_seconds)))
continue
if label != current_window:
current_window = label
window_done = False
window_end = enddt
log(f'New window: {current_window} (until {window_end.strftime("%H:%M:%S")}). Resetting state.')
if window_done:
time.sleep(5)
continue
# Poll SFTP for files
try:
with sftp_conn() as sftp:
files = set(sftp.listdir())
missing = list(required - files)
if missing:
log(f'Not ready yet. Missing: {missing}')
else:
log('All required files present inside window. Starting download...')
try:
fetch_current_rosters(sftp=sftp, label_hour=current_window)
log('Download complete. Marking window_done for this window.')
window_done = True
except Exception as ex_dl:
import traceback
log('Download failed: ' + str(ex_dl))
log(traceback.format_exc())
except Exception as ex_list:
import traceback
log('SFTP list failed: ' + str(ex_list))
log(traceback.format_exc())
# Check for window timeout
try:
if not window_done and window_end and now >= window_end:
log(f'REPORT: window {current_window} ended without all files. Consider alerting/email.')
window_done = True # stop further checks this window
except Exception:
pass
time.sleep(max(5, int(poll_seconds)))
# Canvas data, download all new files
def sync_non_interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
gotten = os.listdir(local_data_folder)
wanted = []
i = 0
for x in resp['files']:
filename = x['filename']
exi = "No "
if filename in gotten: exi = "Yes"
else: wanted.append(x)
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
i += 1
print("I will attempt to download %i files." % len(wanted))
#answer = input("Press enter to begin, or q to quit ")
#if not answer == '': return
good_count = 0
bad_count = 0
for W in wanted:
print("Downloading: " + W['filename'])
response = requests.request(method='GET', url=W['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
(response.status_code, response.reason))
print('URL: ' + W['url'])
bad_count += 1
else:
#Use the downloaded data
with open(local_data_folder + W['filename'], 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
good_count += 1
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
## OLD STYLE CANVAS DATA
# Get something from Canvas Data
def do_request(path):
#Set up the request pieces
method = 'GET'
host = 'api.inshosteddata.com'
    apiTime = datetime.datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
apiContentType = 'application/json'
msgList = []
msgList.append(method)
msgList.append(host)
msgList.append(apiContentType)
msgList.append('')
msgList.append(path)
msgList.append('')
msgList.append(apiTime)
msgList.append(apiSecret)
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
sig = sig.decode('utf-8')
headers = {}
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
headers['Date'] = apiTime
headers['Content-type'] = apiContentType
#Submit the request/get a response
uri = "https://"+host+path
print (uri)
print (headers)
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
#Check to make sure the request was ok
if(response.status_code != 200):
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
jsonData = response.json()
#print(json.dumps(jsonData, indent=4))
return jsonData
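# Hedged usage note: the sync listing consumed by sync_non_interactive() above
# comes from this helper, e.g.
#   resp = do_request('/api/account/self/file/sync')
# which returns the parsed JSON for that path, or None on a non-200 response.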
################
################ SENDING DATA AWAY
################
################
################
# Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1):
import pysftp
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
#todo: these paths
#files = sftp.listdir()
#print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath)
files = sftp.listdir()
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
if prompt:
input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
sftp.close()
"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
print("This local file: " + f + " ", end=' ')
if not f in files:
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
print("Uploaded.")
else:
print("Skipped.")
"""
"""if len(files)==3 and 'users.csv' in files:
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
sftp.get('users.csv','rosters/users-'+folder+'.csv')
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
print folder + '\tSaved three data files in rosters folder.'
courses = open('rosters/courses-'+folder+'.csv','r')
courses.readline()
a = courses.readline()
print a
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
#print this_sem, "", year
print folder + '\tbuilding data file...'
convert_roster_files(this_sem,year,folder)
print folder + '\tmoving files...'
move_to_folder(this_sem,year,folder)
else:
print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################
def sec(t): return "
"+t+"
\n"
def para(t): return ""+t+"
\n"
def ul(t): return "\n"
def li(t): return ""+t+"\n"
def question(t,bracket=1):
ret = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
ret += ""
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
id = ''
for p in parts:
if re.search(r'[a-zA-Z]',p[0]): id += p[0]
ret += "" % id.lower()
return ret + '' + t + '
\n
'
def answer(t):
return t + '
\n'
def read_paragraph_element(element,type="NORMAL_TEXT"):
"""Returns the text in the given ParagraphElement.
Args:
element: a ParagraphElement from a Google Doc.
"""
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run:
return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link'].get('url','') + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<b>' + begin
        end = end + '</b>'
    content = text_run.get('content')
    content = re.sub(u'\u000b','<br>\n',content)
return begin + content + end
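# Hedged example of the ParagraphElement shape this expects (abridged):
#   read_paragraph_element({'textRun': {'content': 'hello', 'textStyle': {'bold': True}}})
#   returns the content wrapped in the bold markup, i.e. '<b>hello</b>'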
def get_doc(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
#ooout = open(fileout,'w')
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
if verbose: print(document)
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
if verbose: print(doc_content)
doc_objects = document.get('inlineObjects')
if verbose: print(doc_objects)
doc_lists = document.get('lists')
text = ''
last_type = ''
answer_text = ''
in_a_list = ''
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
if doc_objects:
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
#input('x?')
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
tempout.write('- - - - - - - -\n\n')
#for value in doc_lists:
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
tempout.write('- - - - - - - -\n\n')
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
# todo: x link, x bold, list, image.
tag_fxn = para
if 'paragraph' in value:
this_text = ''
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if lid == list_stack[0]: # 2
pass
else:
if not lid in list_stack: # 3
list_stack.append(lid)
else: # 4
x = list_stack.pop()
while x != lid: list_stack.pop()
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
            if deeper > 0:
                answer_text += "<ul>\n" * deeper
            elif deeper < 0:
                deeper = -1 * deeper
                answer_text += "</ul>\n" * deeper
if len(list_stack):
tag_fxn = li
elements = value.get('paragraph').get('elements')
# inlineObjectElement": {
# "inlineObjectId": "kix.ssseeu8j9cfx",
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
#text += json.dumps(style, sort_keys=True, indent=4)
if 'namedStyleType' in style:
type = style['namedStyleType']
for elem in elements:
# text content
this_text += read_paragraph_element(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
                            this_text += '<img src="%s" width="%i" height="%i">\n' % (img,w,h)
if last_type=='NORMAL_TEXT' and type!=last_type:
text += answer(answer_text)
answer_text = ''
if type=='HEADING_2':
text += sec(this_text)
this_text = ''
elif type=='HEADING_3':
text += question(this_text,bracket)
this_text = ''
else:
answer_text += tag_fxn(this_text)
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
# The text in table cells are in nested Structural Elements and tables may be
# nested.
text += "\nTABLE\n"
#table = value.get('table')
#for row in table.get('tableRows'):
# cells = row.get('tableCells')
# for cell in cells:
# text += read_strucutural_elements(cell.get('content'))
#elif 'tableOfContents' in value:
# # The text in the TOC is also in a Structural Element.
# toc = value.get('tableOfContents')
# text += read_strucutural_elements(toc.get('content'))
#else:
# print(json.dumps(value, sort_keys=True, indent=4))
text += answer(answer_text)
#print(text)
return text
######### TRY #2 ######
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run: return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link'].get('url','') + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<b>' + begin
        end = end + '</b>'
    elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
        begin = '<i>' + begin
        end = end + '</i>'
    content = text_run.get('content')
    content = re.sub(u'\u000b','<br>\n',content)
return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
text = t[7:].strip()
parts = text.split(", ")
return ('icons',parts)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
text = t[6:].strip()
parts = text.split(", ")
return ('tags',parts)
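# Hedged examples of the two parsers above (illustrative strings only):
#   handle_icons("Icons: star, check") -> ('icons', ['star', 'check'])
#   handle_tags("Tags: zoom, canvas")  -> ('tags', ['zoom', 'canvas'])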
def handle_question(t,bracket=1):
anchor = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
anchor = match.group(1).lower()
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
for p in parts:
if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
return ('question', t, anchor)
def handle_answer(t):
return ('answer',t)
def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
global img_count, img_lookup, img_heights, img_widths
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) \
+ "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
doc_objects = document.get('inlineObjects')
doc_lists = document.get('lists')
#text = ''
result = []
last_type = ''
#answer_text = ''
answer = []
in_a_list = ''
    # Get all the images (inlineObjects may be absent if the doc has no images)
    if doc_objects:
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            fetch_doc_image(k,value)
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
tag_fxn = handle_para
if 'paragraph' in value:
this_text = ''
# First we deal with if we're in a list.
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one (do nothing),
# (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if not lid == list_stack[0]:
if not lid in list_stack: # 3
list_stack.append(lid)
else: # 4
x = list_stack.pop()
while x != lid: list_stack.pop()
elif len(list_stack) > 0:
# current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
            if deeper > 0:
                answer.append("<ul>\n" * deeper)
            elif deeper < 0:
                deeper = -1 * deeper
                answer.append("</ul>\n" * deeper)
if len(list_stack):
tag_fxn = handle_li
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
elements = value.get('paragraph').get('elements')
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
if 'namedStyleType' in style:
type = style['namedStyleType']
# and FINALLY, the actual contents.
for elem in elements:
# text content
this_text += read_paragraph_element_2(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
                            this_text += '<img src="%s" width="%i" height="%i">' % (img,w,h)
# Now for something tricky. Call an appropriate handler, based on:
# (a) what is the paragraph style type?
# (b) is it different from the prev one?
if last_type=='NORMAL_TEXT' and type!=last_type:
if this_text.strip():
result.append(handle_answer(answer))
answer = []
#answer_text = ''
if type=='HEADING_2' and this_text.strip():
result.append( handle_sec(this_text) )
this_text = ''
elif type=='HEADING_3' and this_text.strip():
result.append(handle_question(this_text,bracket))
this_text = ''
else:
if this_text.lower().startswith('tags:'):
tag_fxn = handle_tags
if this_text.lower().startswith('icons:'):
tag_fxn = handle_icons
if this_text.strip():
answer.append(tag_fxn(this_text))
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
pass
result.append(handle_answer(answer))
return json.dumps(result,indent=4)
def process_reg_history(term='fa25'):
from collections import defaultdict
from itertools import groupby
from operator import itemgetter
def read_grouped_csv(path):
with open(path, newline='') as f:
fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
reader = csv.DictReader(f, fieldnames=fieldnames)
rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp
grouped = {}
for ts, group in groupby(rows, key=itemgetter('datetime')):
grouped[ts] = {r['crn']: r for r in group}
return grouped
def crossed_threshold(old_val, new_val, max_val):
thresholds = [0.25, 0.5, 0.75, 1.0]
if int(max_val) == 0:
return False, None
old_ratio = int(old_val) / int(max_val)
new_ratio = int(new_val) / int(max_val)
for t in thresholds:
if old_ratio < t <= new_ratio:
return True, int(t * 100)
return False, None
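    # Worked example (hedged, illustrative numbers): crossed_threshold('11', '13', '24')
    # returns (True, 50), since 11/24 is below the 50% line and 13/24 is at or above it.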
def detect_changes(prev, curr):
changes = defaultdict(list)
all_crns = prev.keys() | curr.keys()
for crn in all_crns:
o, n = prev.get(crn), curr.get(crn)
if not o:
changes[crn].append((n['datetime'], "Section was added."))
elif not n:
changes[crn].append((
o['datetime'],
f"Section was removed (last seen: teacher {o['teacher']}, "
f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
))
else:
dt = n['datetime']
if o['teacher'] != n['teacher']:
changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
if o['enrolled'] != n['enrolled']:
crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
if crossed:
changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']:
changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
return changes
def time_to_iso(s):
return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat()
def detect_changes_structured(prev, curr):
changes = defaultdict(list)
all_crns = prev.keys() | curr.keys()
for crn in all_crns:
o, n = prev.get(crn), curr.get(crn)
if not o:
changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."})
elif not n:
changes[crn].append(
{'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.",
'value': o['enrolled'], 'capacity': o['max'], })
else:
dt = time_to_iso(n['datetime'])
if o['teacher'] != n['teacher']:
changes[crn].append({'time':dt, "type":'teacher_change',
'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.",
'old_teacher': o['teacher'], 'new_teacher': n['teacher'], })
if o['enrolled'] != n['enrolled']:
crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
if crossed:
changes[crn].append({'time':dt, "type":'enrollment_milestone',
'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).",
'percent':percent,'value':n['enrolled'],'capacity':n['max'] })
                    if int(n['waitlisted']) > 10 and int(o['waitlisted']) < int(n['waitlisted']):
                        changes[crn].append({'time':dt, "type":'enrollment_milestone',
                            'message': f"Waitlist exceeds 10: {n['waitlisted']}.",
                            'value':n['waitlisted']})
return changes
def process_diff_timeline(path):
snapshots = read_grouped_csv(path)
timeline = sorted(snapshots.keys())
timeline_diffs = []
timeline_diffs_structured = []
course_names = {} # crn -> latest known course name
for i in range(1, len(timeline)):
prev_ts, curr_ts = timeline[i-1], timeline[i]
prev, curr = snapshots[prev_ts], snapshots[curr_ts]
# update course name map
for crn, row in curr.items():
course_names[crn] = row['course']
delta = detect_changes(prev, curr)
timeline_diffs.append(delta)
delta_structured = detect_changes_structured(prev,curr)
timeline_diffs_structured.append(delta_structured)
# Flatten and group by crn
crn_changes = defaultdict(list)
for delta in timeline_diffs:
for crn, changes in delta.items():
crn_changes[crn].extend(changes)
# Flatten and group by crn
crn_changes_structured = defaultdict(list)
for delta in timeline_diffs_structured:
for crn, changes in delta.items():
crn_changes_structured[crn].extend(changes)
# Sort changes for each CRN by datetime
for crn in crn_changes:
crn_changes[crn].sort(key=lambda x: x[0])
        # Sort structured changes for each CRN by time
        for crn in crn_changes_structured:
            crn_changes_structured[crn].sort(key=lambda x: x['time'])
return crn_changes, crn_changes_structured, course_names
fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text
fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8')
fresh_file.write(fresh_history)
fresh_file.close()
output1 = codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8')
output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8')
changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv")
# once for plain text
for crn in sorted(changes, key=lambda c: course_names.get(c, "")):
course = course_names.get(crn, "")
course_output = {'code': course, 'crn':crn,'events':[]}
print(f"\n{course} (CRN {crn}):")
output1.write(f"\n{course} (CRN {crn}):\n")
for dt, msg in changes[crn]:
print(f" [{dt}] {msg}")
output1.write(f" [{dt}] {msg}\n")
course_output['events'].append({'message':msg, 'time':time_to_iso(dt)})
# again for structured
crn_list = []
for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")):
course = course_names.get(crn, "")
course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]}
crn_list.append(course_output)
output2.write( json.dumps(crn_list,indent=2) )
output2.close()
def recreate_all():
# Use canonical semester short codes from semesters.py
try:
from semesters import code as SEM_CODES
except Exception:
SEM_CODES = []
for x in SEM_CODES:
try:
recreate_reg_data(x)
except Exception as e:
print(f'Failed on {x} with: {e}')
def recreate_reg_data(term="fa25"):
from collections import defaultdict
from datetime import datetime
def parse_row(row):
dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
crn = row['crn']
enrolled = int(row['enrolled'])
return dt, row['datetime'], crn, enrolled
def reduce_latest_per_day(rows):
latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled)
latest_ts_by_date = {} # date → (dt, ts) for header naming
for row in rows:
dt, full_ts, crn, enrolled = parse_row(row)
date_str = dt.date().isoformat()
ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want
# for each crn, per day, keep latest reading
if date_str not in latest[crn] or dt > latest[crn][date_str][0]:
latest[crn][date_str] = (dt, ts_header, enrolled)
# also record latest timestamp per day for consistent column headers
if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]:
latest_ts_by_date[date_str] = (dt, ts_header)
return latest, [ts for _, ts in sorted(latest_ts_by_date.values())]
def pivot_table(latest, headers):
crns = sorted(latest)
table = []
for crn in crns:
row = [crn]
for ts in headers:
date_str = ts[:10] # match on YYYY-MM-DD
val = latest[crn].get(date_str)
if val and val[1] == ts:
row.append(str(val[2]))
else:
row.append("")
table.append(row)
return ['crn'] + headers, table
#with open(f"cache/reg_history_{term}.csv", newline='') as f:
from io import StringIO
url = f"https://gavilan.cc/schedule/reg_history_{term}.csv"
# Download
resp = requests.get(url)
resp.raise_for_status() # raises if bad status
# Wrap the text in a file-like object
f = StringIO(resp.text)
fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
reader = csv.DictReader(f, fieldnames=fieldnames)
rows = list(reader)
latest, headers = reduce_latest_per_day(rows)
header_row, table = pivot_table(latest, headers)
with open(f"cache/reg_data_{term}.csv", "w", newline='') as f:
writer = csv.writer(f)
writer.writerow(header_row)
writer.writerows(table)
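# Hedged usage note: recreate_reg_data('fa25') downloads the full
# reg_history_fa25.csv from gavilan.cc and rewrites cache/reg_data_fa25.csv
# with one column per day (the latest reading of that day, per CRN).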
if __name__ == "__main__":
print ('')
options = { 1: ['Fetch rosters on schedule',fetch_current_rosters_auto] ,
2: ['Get canvas data 2024 style', canvas_data_2024_run ],
3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
4: ['Narrative timeline of section updates', process_reg_history],
5: ['Create narrative format all semesters', recreate_all],
6: ['Recreate reg_data from full reg history', recreate_reg_data],
}
'''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
2: ['Fetch rosters',fetch_current_rosters] ,
3:
4: ['Compute how registration is filling up classes', schedule_filling] ,
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
6: ['Canvas data: interactive sync', interactive ],
7: ['Canvas data: automated sync', sync_non_interactive ],
8:
9:
16: ['Scrape schedule from ssb', scrape_schedule_multi ],
14: ['Generate latestart schedule', list_latestarts ],
15: ['Test ssb calls with python', scrape_schedule_py ],
10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data_from_cvc],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
'''
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
# Testing
#if __name__ == "__main__":
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
#print "These are the users: "
#print users
#getSemesterSchedule()
#get_doc()
#pass