# canvasapp/pipelines.py


import util
import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
from datetime import timedelta
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key
import asyncio
from dap.api import DAPClient
from dap.dap_types import Credentials
from dap.integration.database import DatabaseConnection
from dap.replicator.sql import SQLReplicator
"""
Everything to do with fetching data,
- From iLearn, via token
- current roster uploads from instructures sftp site
- raw logs and other from canvas data repo
- from ssb, use firefox to scrape the schedule
And some subsequent processing:
- Raw roster files, into a more compact json format
- Raw logs into something more useful
"""
verbose = False
users = {}
users_by_id = {}
# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
sys.setrecursionlimit( 100000 )
local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
class FetchError(Exception):
pass
DEBUG = 0
def d(s,end=''):
global DEBUG
if end and DEBUG: print(s,end=end)
elif DEBUG: print(s)
################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################
# Main canvas querying fxn
def fetch(target,verbose=0,params=0,media=0):
    # If there are more results, recursively call myself, adding on to the results.
    results = 0
    count = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
if media:
r2 = requests.get(target, headers = header_media)
elif params:
r2 = requests.get(target, headers = header, params = params)
else:
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
#link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
#if (verbose): print("++ More link: " + link)
nest = fetch(link,verbose,params,media)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
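# Illustrative sketch (not wired into the menu): fetch() follows the "next" links in the
# Link header itself, so one call returns the merged result list. The course id is
# hypothetical; the endpoint shape mirrors the test call at the bottom of this file.
def _example_fetch_course_users():
    roster = fetch('/api/v1/courses/1234/users?per_page=100', verbose=1)
    for user in roster:
        print(user.get('id'), user.get('name'))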
# Main canvas querying fxn - stream version - don't die on big requests
def fetch_stream(target,verbose=0):
    # If there are more results, keep following the "next" link, yielding one page at a time.
    results = 0
    count = 0
while target:
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
if r2.status_code == 502:
raise FetchError()
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
next_link_found = 0
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
target = link
next_link_found = 1
break
if not next_link_found: target = 0
yield results
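# Illustrative sketch: consume fetch_stream() one page at a time so a very large endpoint
# never has to be merged in memory. The endpoint is hypothetical; account_id comes from
# canvas_secrets.
def _example_fetch_stream_pages():
    total = 0
    for page in fetch_stream(f'/api/v1/accounts/{account_id}/courses?per_page=100', verbose=1):
        total += len(page)
        print(f"got {len(page)} items, {total} so far")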
# For endpoints that wrap their results in a one-key dict, collapse that key out,
# because paging makes merging awkward otherwise. Example: enrollment_terms.
def fetch_collapse(target,collapse='',verbose=0):
    # If there are more results, recursively call myself, adding on to the results.
results = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
except:
print("-- Failed to parse: ", r2.text)
if verbose: print(r2.headers)
if collapse and collapse in results:
results = results[collapse]
if ('link' in r2.headers):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
nest = fetch_collapse(link, collapse, verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
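# Illustrative sketch: enrollment terms come back wrapped in a one-key dict
# ({"enrollment_terms": [...]}), which is exactly the case fetch_collapse() flattens.
# account_id comes from canvas_secrets.
def _example_fetch_terms():
    terms = fetch_collapse(f'/api/v1/accounts/{account_id}/terms', collapse='enrollment_terms', verbose=1)
    for term in terms:
        print(term.get('id'), term.get('name'))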
################
################ CANVAS DATA
################
################
################
# Get canvas data 2024 style
def canvas_data_2024_run():
print("Updating all tables.")
asyncio.run(canvas_data_2024())
print("Done with all tables.")
async def canvas_data_2024():
base_url: str = os.environ["DAP_API_URL"]
client_id: str = os.environ["DAP_CLIENT_ID"]
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
# todo: use secrets
connection_string: str = "postgresql://postgres:rolley34@192.168.1.199/db"
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
async with DatabaseConnection(connection_string).open() as db_connection:
async with DAPClient(base_url, credentials) as session:
#tables = await session.get_tables("canvas")
for table in desired_tables:
print(f" trying to update {table} ")
try:
#await SQLReplicator(session, db_connection).initialize("canvas", table)
await SQLReplicator(session, db_connection).synchronize("canvas", table)
except Exception as e:
print(f" - skipping {table} because {e}")
# Set up canvas data 2024 style (one-time table initialization)
def setup_canvas_data_2024_run():
print("Setting up all tables.")
asyncio.run(setup_canvas_data_2024())
print("Done with all tables.")
async def setup_canvas_data_2024():
base_url: str = os.environ["DAP_API_URL"]
client_id: str = os.environ["DAP_CLIENT_ID"]
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db"
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
async with DatabaseConnection(connection_string).open() as db_connection:
async with DAPClient(base_url, credentials) as session:
#tables = await session.get_tables("canvas")
for table in desired_tables:
print(f" {table}")
try:
await SQLReplicator(session, db_connection).initialize("canvas", table)
except Exception as e:
print(f" - skipping {table} because {e}")
################
################ ROSTERS AND REGISTRATION
################
################
################
# todo: the pipeline is disorganized. Organize it to have
# a hope of taking all this to a higher level.
#
# todo: where does this belong in the pipeline? compare with recent_schedules()
# Take the generically named roster upload files, move them into a semester folder, and date-stamp each name.
def move_to_folder(sem,year,folder,files):
semester = year+sem
    semester_path = 'cache/rosters/%s' % semester
    now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
    print("+ Moving roster files to folder: %s" % semester_path)
    if not os.path.isdir(semester_path):
        print("+ Creating folder: %s" % semester_path)
        os.makedirs(semester_path)
if 'courses.csv' in files:
os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
if 'enrollments.csv' in files:
os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
if 'users.csv' in files:
os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
# Take the raw upload (csv) files and combine them into one big json file.
# This covers enrollment files, not the schedule.
def convert_roster_files(semester="",year="",folder=""):
    if not semester:
        semester = input("the semester? (ex: spring) ")
        year = input("the year? (ex: 2020) ")
        folder = input("Folder? (ex 2020-02-25-14-58-20) ")
uf = open('cache/rosters/users-'+folder+'.csv','r')
cf = open('cache/rosters/courses-'+folder+'.csv','r')
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
u = csv.DictReader(uf)
c = csv.DictReader(cf)
e = csv.DictReader(ef)
uu = [i for i in u]
cc = [i for i in c]
ee = [i for i in e]
uf.close()
cf.close()
ef.close()
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
if os.path.exists(myrosterfile):
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
last_fileobj = open(myrosterfile,'r')
last_file = json.load(last_fileobj)
last_fileobj.close()
info = last_file[3]
last_date = info['date_filestring']
        print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
        try:
            os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
            print(' -- ok')
        except Exception as e:
            print(" ** Failed because I couldn't move the previous roster file: %s" % myrosterfile)
            print(e)
            # Fall back to a sibling filename so we don't clobber the file we couldn't move.
            myrosterfile = myrosterfile.replace('roster_', 'new_roster_')
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
try:
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
new_roster.close()
print(" -- Wrote roster info to: %s." % myrosterfile)
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(" ** " + str(e))
def file_doesnt_exist(name):
    # Get the list of files already downloaded into the rosters cache
    files = os.listdir('cache/rosters')
    # Filter out zero-size files and directories
    files = [f for f in files if os.path.isfile(os.path.join('cache/rosters', f))
             and os.path.getsize(os.path.join('cache/rosters', f)) > 0]
    if name in files:
        print( f" * file: {name} already exists. not downloading." )
    else:
        print( f" * file: {name} downloading." )
    # True if the file is not in the filtered list (i.e. it still needs downloading)
    return not (name in files)
# From the Instructure SFTP site
def fetch_current_rosters():
import pysftp
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
sftp.chdir('SIS')
files = sftp.listdir()
ff = open('cache/pipeline.log.txt','a')
now = datetime.datetime.now()
exact_time = now.strftime('%Y-%m-%d-%H-%M-%S')
rounded_hour = (now.replace(second=0, microsecond=0, minute=0, hour=now.hour)
+ timedelta(hours=now.minute//30))
rounded_time = rounded_hour.strftime('%Y-%m-%d-%H')
if len(files)>0: # and 'users.csv' in files:
print(f"--> {exact_time}: I see these files at instructure ftp site:")
[print(f" - {f}") for f in files]
i = 0
seen_files = []
check = ['login','users','courses','enrollments']
for checking in check:
try:
if f'{checking}.csv' in files and file_doesnt_exist(f'{checking}-{rounded_time}.csv'):
sftp.get(f'{checking}.csv',f'cache/rosters/{checking}-{rounded_time}.csv')
i += 1
seen_files.append(f'{checking}.csv')
except:
print(f' * {checking}.csv not present')
            print(' Saved %i data files in rosters folder.' % i)
            ff.write( f" Saved {i} data files: {seen_files}\n")
if i>2:
if 'courses.csv' in seen_files:
courses = open(f'cache/rosters/courses-{rounded_time}.csv','r')
courses.readline()
a = courses.readline()
print(a)
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
print(f" -> This semester is: {this_sem}, {year}" )
print(f" -> Building data file... {rounded_time}")
convert_roster_files(this_sem,year,rounded_time)
print(' -> moving files...')
ff.write( f" Moved files to folder: {this_sem} {year} {rounded_time}\n")
move_to_folder(this_sem,year,rounded_time,seen_files)
else:
print(" * No courses file. Not moving files.")
ff.write( f" * No courses file. Not moving files.\n")
else:
print(f"--> {exact_time}: Don't see files.")
sftp.close()
def fetch_current_rosters_auto():
fetch_minute = "56,57,58,59,00,01,02,03,04,05,06".split(",")
for m in fetch_minute:
schedule.every().hour.at(f":{m}").do(fetch_current_rosters)
#schedule.every().day.at("12:35").do(sync_non_interactive)
#schedule.every().day.at("21:00").do(sync_non_interactive)
#print(f"running every hour on the :{fetch_minute}\n")
while True:
try:
schedule.run_pending()
time.sleep(4)
except Exception as e:
import traceback
print(" ---- * * * Failed with: %s" % str(e))
ff = open('cache/pipeline.log.txt','a')
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
ff.write(traceback.format_exc()+"\n---------\n\n")
ff.close()
#schedule.CancelJob
time.sleep(1)
# Canvas data, download all new files
def sync_non_interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
gotten = os.listdir(local_data_folder)
wanted = []
i = 0
for x in resp['files']:
filename = x['filename']
exi = "No "
if filename in gotten: exi = "Yes"
else: wanted.append(x)
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
i += 1
print("I will attempt to download %i files." % len(wanted))
#answer = input("Press enter to begin, or q to quit ")
#if not answer == '': return
good_count = 0
bad_count = 0
for W in wanted:
print("Downloading: " + W['filename'])
response = requests.request(method='GET', url=W['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
(response.status_code, response.reason))
print('URL: ' + W['url'])
bad_count += 1
else:
#Use the downloaded data
with open(local_data_folder + W['filename'], 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
good_count += 1
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
## OLD STYLE CANVAS DATA
# Get something from Canvas Data
def do_request(path):
#Set up the request pieces
method = 'GET'
host = 'api.inshosteddata.com'
    apiTime = datetime.datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
apiContentType = 'application/json'
msgList = []
msgList.append(method)
msgList.append(host)
msgList.append(apiContentType)
msgList.append('')
msgList.append(path)
msgList.append('')
msgList.append(apiTime)
msgList.append(apiSecret)
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
sig = sig.decode('utf-8')
headers = {}
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
headers['Date'] = apiTime
headers['Content-type'] = apiContentType
#Submit the request/get a response
uri = "https://"+host+path
print (uri)
print (headers)
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
#Check to make sure the request was ok
if(response.status_code != 200):
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
jsonData = response.json()
#print(json.dumps(jsonData, indent=4))
return jsonData
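# Illustrative sketch (mirrors sync_non_interactive above): use do_request() to list the
# files the old-style Canvas Data API currently offers, without downloading anything.
def _example_list_canvas_data_files():
    resp = do_request('/api/account/self/file/sync')
    if resp:
        for f in resp.get('files', []):
            print(f['filename'])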
################
################ SENDING DATA AWAY
################
################
################
# Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1):
import pysftp
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
#todo: these paths
#files = sftp.listdir()
#print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath)
files = sftp.listdir()
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
if prompt:
input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
sftp.close()
"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
print("This local file: " + f + " ", end=' ')
if not f in files:
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
print("Uploaded.")
else:
print("Skipped.")
"""
"""if len(files)==3 and 'users.csv' in files:
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
sftp.get('users.csv','rosters/users-'+folder+'.csv')
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
print folder + '\tSaved three data files in rosters folder.'
courses = open('rosters/courses-'+folder+'.csv','r')
courses.readline()
a = courses.readline()
print a
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
#print this_sem, "", year
print folder + '\tbuilding data file...'
convert_roster_files(this_sem,year,folder)
print folder + '\tmoving files...'
move_to_folder(this_sem,year,folder)
else:
print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################
def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"
def question(t,bracket=1):
ret = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
ret += "<a name='" + match.group(1) + "'></a>"
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
id = ''
for p in parts:
            if p and re.search(r'[a-zA-Z]',p[0]): id += p[0]
ret += "<a name='%s'></a>" % id.lower()
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
def answer(t):
return t + '</div></div>\n'
def read_paragraph_element(element,type="NORMAL_TEXT"):
"""Returns the text in the given ParagraphElement.
Args:
element: a ParagraphElement from a Google Doc.
"""
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run:
return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
def get_doc(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
#ooout = open(fileout,'w')
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
if verbose: print(document)
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
if verbose: print(doc_content)
doc_objects = document.get('inlineObjects')
if verbose: print(doc_objects)
doc_lists = document.get('lists')
text = '<div class="acrd_grp" data-accordion-group="">'
last_type = ''
answer_text = ''
in_a_list = ''
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
if doc_objects:
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
#input('x?')
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
tempout.write('- - - - - - - -\n\n')
#for value in doc_lists:
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
tempout.write('- - - - - - - -\n\n')
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
# todo: x link, x bold, list, image.
tag_fxn = para
if 'paragraph' in value:
this_text = ''
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if lid == list_stack[0]: # 2
pass
else:
if not lid in list_stack: # 3
list_stack.append(lid)
                        else: # 4 - a nested list ended; pop back down to the list we're returning to
                            while list_stack and list_stack[-1] != lid:
                                list_stack.pop()
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer_text += "<ul>" * deeper
elif deeper < 0:
deeper = -1 * deeper
answer_text += "</ul>" * deeper
if len(list_stack):
tag_fxn = li
elements = value.get('paragraph').get('elements')
# inlineObjectElement": {
# "inlineObjectId": "kix.ssseeu8j9cfx",
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
#text += json.dumps(style, sort_keys=True, indent=4)
if 'namedStyleType' in style:
type = style['namedStyleType']
for elem in elements:
# text content
this_text += read_paragraph_element(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
if last_type=='NORMAL_TEXT' and type!=last_type:
text += answer(answer_text)
answer_text = ''
if type=='HEADING_2':
text += sec(this_text)
this_text = ''
elif type=='HEADING_3':
text += question(this_text,bracket)
this_text = ''
else:
answer_text += tag_fxn(this_text)
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
# The text in table cells are in nested Structural Elements and tables may be
# nested.
text += "\nTABLE\n"
#table = value.get('table')
#for row in table.get('tableRows'):
# cells = row.get('tableCells')
# for cell in cells:
# text += read_strucutural_elements(cell.get('content'))
#elif 'tableOfContents' in value:
# # The text in the TOC is also in a Structural Element.
# toc = value.get('tableOfContents')
# text += read_strucutural_elements(toc.get('content'))
#else:
# print(json.dumps(value, sort_keys=True, indent=4))
text += answer(answer_text)
#text += '</div>'
#print(text)
return text
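# Illustrative sketch (not wired into the menu): render a Google Doc into the HTML
# accordion fragment that get_doc() produces and save it locally. The document id is a
# placeholder.
def _example_save_doc_html():
    html = get_doc('YOUR_GOOGLE_DOC_ID')
    with codecs.open('cache/trash/doc_preview.html', 'w', 'utf-8') as out:
        out.write(html)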
######### TRY #2 ######
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run: return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
begin = '<em>' + begin
end = end + '</em>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
text = t[7:].strip()
parts = text.split(", ")
return ('icons',parts)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
text = t[6:].strip()
parts = text.split(", ")
return ('tags',parts)
def handle_question(t,bracket=1):
anchor = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
anchor = match.group(1).lower()
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
for p in parts:
            if p and re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
return ('question', t, anchor)
def handle_answer(t):
return ('answer',t)
def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
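# Worked examples for the handlers above (inputs are hypothetical):
#   handle_icons("Icons: star, clock")  -> ('icons', ['star', 'clock'])
#   handle_tags("Tags: email, quiz")    -> ('tags', ['email', 'quiz'])
#   handle_question("How do I reset my password? [reset]")
#       -> ('question', 'How do I reset my password? ', 'reset')
#   handle_sec("Getting Started")       -> ('section', 'Getting Started')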
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
global img_count, img_lookup, img_heights, img_widths
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) \
+ "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
doc_objects = document.get('inlineObjects')
doc_lists = document.get('lists')
#text = ''
result = []
last_type = ''
#answer_text = ''
answer = []
in_a_list = ''
# Get all the images
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
fetched = fetch_doc_image(k,value)
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
tag_fxn = handle_para
if 'paragraph' in value:
this_text = ''
# First we deal with if we're in a list.
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one (do nothing),
# (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if not lid == list_stack[0]:
if not lid in list_stack: # 3
list_stack.append(lid)
                        else: # 4 - a nested list ended; pop back down to the list we're returning to
                            while list_stack and list_stack[-1] != lid:
                                list_stack.pop()
elif len(list_stack) > 0:
# current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer.append("<ul>" * deeper)
elif deeper < 0:
deeper = -1 * deeper
answer.append("</ul>" * deeper)
if len(list_stack):
tag_fxn = handle_li
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
elements = value.get('paragraph').get('elements')
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
if 'namedStyleType' in style:
type = style['namedStyleType']
# and FINALLY, the actual contents.
for elem in elements:
# text content
this_text += read_paragraph_element_2(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
# Now for something tricky. Call an appropriate handler, based on:
# (a) what is the paragraph style type?
# (b) is it different from the prev one?
if last_type=='NORMAL_TEXT' and type!=last_type:
if this_text.strip():
result.append(handle_answer(answer))
answer = []
#answer_text = ''
if type=='HEADING_2' and this_text.strip():
result.append( handle_sec(this_text) )
this_text = ''
elif type=='HEADING_3' and this_text.strip():
result.append(handle_question(this_text,bracket))
this_text = ''
else:
if this_text.lower().startswith('tags:'):
tag_fxn = handle_tags
if this_text.lower().startswith('icons:'):
tag_fxn = handle_icons
if this_text.strip():
answer.append(tag_fxn(this_text))
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
pass
result.append(handle_answer(answer))
return json.dumps(result,indent=4)
def process_reg_history(term='fa25'):
from collections import defaultdict
from itertools import groupby
from operator import itemgetter
def read_grouped_csv(path):
with open(path, newline='') as f:
fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
reader = csv.DictReader(f, fieldnames=fieldnames)
            rows = sorted(reader, key=lambda r: r['datetime'])  # sort so groupby can batch rows sharing a timestamp
grouped = {}
for ts, group in groupby(rows, key=itemgetter('datetime')):
grouped[ts] = {r['crn']: r for r in group}
return grouped
def crossed_threshold(old_val, new_val, max_val):
thresholds = [0.25, 0.5, 0.75, 1.0]
if int(max_val) == 0:
return False, None
old_ratio = int(old_val) / int(max_val)
new_ratio = int(new_val) / int(max_val)
for t in thresholds:
if old_ratio < t <= new_ratio:
return True, int(t * 100)
return False, None
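    # Worked example: crossed_threshold('11', '13', '50') -> (True, 25) because
    # 11/50 = 0.22 sits below the 25% mark and 13/50 = 0.26 reaches it;
    # crossed_threshold('5', '6', '0') -> (False, None) since zero-capacity sections are skipped.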
def detect_changes(prev, curr):
changes = defaultdict(list)
all_crns = prev.keys() | curr.keys()
for crn in all_crns:
o, n = prev.get(crn), curr.get(crn)
if not o:
changes[crn].append((n['datetime'], "Section was added."))
elif not n:
changes[crn].append((
o['datetime'],
f"Section was removed (last seen: teacher {o['teacher']}, "
f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
))
else:
dt = n['datetime']
if o['teacher'] != n['teacher']:
changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
if o['enrolled'] != n['enrolled']:
crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
if crossed:
changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']:
changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
return changes
def time_to_iso(s):
return datetime.datetime.strptime(s, "%Y-%m-%dT%H-%M").isoformat()
def detect_changes_structured(prev, curr):
changes = defaultdict(list)
all_crns = prev.keys() | curr.keys()
for crn in all_crns:
o, n = prev.get(crn), curr.get(crn)
if not o:
changes[crn].append({'time':time_to_iso(n['datetime']), "type":'section update', 'message': "Section was added."})
elif not n:
changes[crn].append(
{'time':time_to_iso(o['datetime']), "type":'section update', 'message': "Section was removed.",
'value': o['enrolled'], 'capacity': o['max'], })
else:
dt = time_to_iso(n['datetime'])
if o['teacher'] != n['teacher']:
changes[crn].append({'time':dt, "type":'teacher_change',
'message': f"Teacher changed from {o['teacher']} to {n['teacher']}.",
'old_teacher': o['teacher'], 'new_teacher': n['teacher'], })
if o['enrolled'] != n['enrolled']:
crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
if crossed:
changes[crn].append({'time':dt, "type":'enrollment_milestone',
'message': f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).",
'percent':percent,'value':n['enrolled'],'capacity':n['max'] })
                    if int(n['waitlisted']) > 10 and int(o['waitlisted']) < int(n['waitlisted']):
                        changes[crn].append({'time':dt, "type":'enrollment_milestone',
                            'message': f"Waitlist exceeds 10: {n['waitlisted']}.",
                            'value':n['waitlisted']})
return changes
def process_diff_timeline(path):
snapshots = read_grouped_csv(path)
timeline = sorted(snapshots.keys())
timeline_diffs = []
timeline_diffs_structured = []
course_names = {} # crn -> latest known course name
for i in range(1, len(timeline)):
prev_ts, curr_ts = timeline[i-1], timeline[i]
prev, curr = snapshots[prev_ts], snapshots[curr_ts]
# update course name map
for crn, row in curr.items():
course_names[crn] = row['course']
delta = detect_changes(prev, curr)
timeline_diffs.append(delta)
delta_structured = detect_changes_structured(prev,curr)
timeline_diffs_structured.append(delta_structured)
# Flatten and group by crn
crn_changes = defaultdict(list)
for delta in timeline_diffs:
for crn, changes in delta.items():
crn_changes[crn].extend(changes)
# Flatten and group by crn
crn_changes_structured = defaultdict(list)
for delta in timeline_diffs_structured:
for crn, changes in delta.items():
crn_changes_structured[crn].extend(changes)
# Sort changes for each CRN by datetime
for crn in crn_changes:
crn_changes[crn].sort(key=lambda x: x[0])
        # Sort structured changes for each CRN by time
        for crn in crn_changes_structured:
            crn_changes_structured[crn].sort(key=lambda x: x['time'])
return crn_changes, crn_changes_structured, course_names
fresh_history = requests.get(f"http://gavilan.cc/schedule/reg_history_{term}.csv").text
fresh_file = codecs.open(f'cache/reg_history_{term}.csv','w','utf-8')
fresh_file.write(fresh_history)
fresh_file.close()
output1 = codecs.open(f'cache/reg_timeline_{term}.txt','w','utf-8')
output2 = codecs.open(f'cache/reg_timeline_{term}.json','w','utf-8')
changes, changes_structured, course_names = process_diff_timeline(f"cache/reg_history_{term}.csv")
# once for plain text
for crn in sorted(changes, key=lambda c: course_names.get(c, "")):
course = course_names.get(crn, "")
course_output = {'code': course, 'crn':crn,'events':[]}
print(f"\n{course} (CRN {crn}):")
output1.write(f"\n{course} (CRN {crn}):\n")
for dt, msg in changes[crn]:
print(f" [{dt}] {msg}")
output1.write(f" [{dt}] {msg}\n")
course_output['events'].append({'message':msg, 'time':time_to_iso(dt)})
# again for structured
crn_list = []
for crn in sorted(changes_structured, key=lambda c: course_names.get(c, "")):
course = course_names.get(crn, "")
course_output = {'code': course, 'crn':crn,'events':changes_structured[crn]}
crn_list.append(course_output)
output2.write( json.dumps(crn_list,indent=2) )
output2.close()
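# Shape of one record in cache/reg_timeline_<term>.json (values are hypothetical):
#   {"code": "CSIS 42", "crn": "40123", "events": [
#       {"time": "2025-05-01T09:00:00", "type": "enrollment_milestone",
#        "message": "Enrollment crossed 50% (15/30).",
#        "percent": 50, "value": "15", "capacity": "30"}]}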
def recreate_all():
for x in 'sp20 su20 fa20 sp21 su21 fa21 sp22 su22 fa22 sp23 su23 fa23 sp24'.split(' '):
try:
recreate_reg_data(x)
except Exception as e:
print(f'Failed on {x} with: {e}')
def recreate_reg_data(term="fa25"):
from collections import defaultdict
from datetime import datetime
def parse_row(row):
dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
crn = row['crn']
enrolled = int(row['enrolled'])
return dt, row['datetime'], crn, enrolled
def reduce_latest_per_day(rows):
latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled)
latest_ts_by_date = {} # date → (dt, ts) for header naming
for row in rows:
dt, full_ts, crn, enrolled = parse_row(row)
date_str = dt.date().isoformat()
ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want
# for each crn, per day, keep latest reading
if date_str not in latest[crn] or dt > latest[crn][date_str][0]:
latest[crn][date_str] = (dt, ts_header, enrolled)
# also record latest timestamp per day for consistent column headers
if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]:
latest_ts_by_date[date_str] = (dt, ts_header)
return latest, [ts for _, ts in sorted(latest_ts_by_date.values())]
def pivot_table(latest, headers):
crns = sorted(latest)
table = []
for crn in crns:
row = [crn]
for ts in headers:
date_str = ts[:10] # match on YYYY-MM-DD
val = latest[crn].get(date_str)
if val and val[1] == ts:
row.append(str(val[2]))
else:
row.append("")
table.append(row)
return ['crn'] + headers, table
#with open(f"cache/reg_history_{term}.csv", newline='') as f:
from io import StringIO
url = f"https://gavilan.cc/schedule/reg_history_{term}.csv"
# Download
resp = requests.get(url)
resp.raise_for_status() # raises if bad status
# Wrap the text in a file-like object
f = StringIO(resp.text)
fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
reader = csv.DictReader(f, fieldnames=fieldnames)
rows = list(reader)
latest, headers = reduce_latest_per_day(rows)
header_row, table = pivot_table(latest, headers)
with open(f"cache/reg_data_{term}.csv", "w", newline='') as f:
writer = csv.writer(f)
writer.writerow(header_row)
writer.writerows(table)
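# Illustrative usage: rebuild the daily pivot for one term. The output lands in
# cache/reg_data_<term>.csv with one row per CRN and one column per day, where each
# column header is that day's latest snapshot hour (e.g. 2025-05-01T09).
#   recreate_reg_data('fa25')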
if __name__ == "__main__":
print ('')
options = { 1: ['Fetch rosters on schedule',fetch_current_rosters_auto] ,
2: ['Get canvas data 2024 style', canvas_data_2024_run ],
3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
4: ['Narrative timeline of section updates', process_reg_history],
5: ['Create narrative format all semesters', recreate_all],
6: ['Recreate reg_data from full reg history', recreate_reg_data],
}
'''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
2: ['Fetch rosters',fetch_current_rosters] ,
3:
4: ['Compute how registration is filling up classes', schedule_filling] ,
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
6: ['Canvas data: interactive sync', interactive ],
7: ['Canvas data: automated sync', sync_non_interactive ],
8:
9:
16: ['Scrape schedule from ssb', scrape_schedule_multi ],
14: ['Generate latestart schedule', list_latestarts ],
15: ['Test ssb calls with python', scrape_schedule_py ],
10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data_from_cvc],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
'''
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
# Testing
#if __name__ == "__main__":
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
#print "These are the users: "
#print users
#getSemesterSchedule()
#get_doc()
#pass