#from sqlite3 import paramstyle
#from time import strptime
#from util import UnicodeDictReader

import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
import asyncio
#import pdb

from datetime import timedelta
#from collections import defaultdict

from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key

from dap.api import DAPClient
from dap.dap_types import Credentials
from dap.integration.database import DatabaseConnection
from dap.replicator.sql import SQLReplicator
|
|
|
|
|
|
|
|
"""
|
|
Everything to do with fetching data,
|
|
- From iLearn, via token
|
|
- current roster uploads from instructures sftp site
|
|
- raw logs and other from canvas data repo
|
|
- from ssb, use firefox to scrape the schedule
|
|
|
|
|
|
And some subsequent processing:
|
|
- Raw roster files, into a more compact json format
|
|
- Raw logs into something more useful
|
|
"""
|
|
|
|
verbose = False
|
|
|
|
users = {}
|
|
users_by_id = {}
|
|
|
|
# todo: all these constants for SSB -- line 1008
|
|
#
|
|
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sys.setrecursionlimit( 100000 )
|
|
|
|
local_data_folder = 'cache/canvas_data/'
|
|
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
|
|
|
|
|
|
|
|
|
|
class FetchError(Exception):
|
|
pass
|
|
|
|
|
|
DEBUG = 0
|
|
|
|
def d(s,end=''):
|
|
global DEBUG
|
|
if end and DEBUG: print(s,end=end)
|
|
elif DEBUG: print(s)
|
|
|
|
################
|
|
################ CANVAS API MAIN FETCHING FUNCTIONS
|
|
################
|
|
################
|
|
################
|
|
|
|
|
|
|
|
|
|
# Main canvas querying fxn
def fetch(target,verbose=0,params=0,media=0):
    # If there are more results, recursively call myself, adding on to the results.
    results = 0
    count = 0
    if target[0:4] != "http": target = url + target
    if verbose:
        print("++ Fetching: " + target)
    if media:
        r2 = requests.get(target, headers = header_media)
    elif params:
        r2 = requests.get(target, headers = header, params = params)
    else:
        r2 = requests.get(target, headers = header)
    #if verbose:
    #    print("++ Got: " + r2.text)
    try:
        results = json.loads(r2.text)
        count = len(results)
    except Exception:
        print("-- Failed to parse: ", r2.text)
    if verbose:
        print("Got %i results" % count)
    if verbose > 1:
        print(r2.headers)

    tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
    tempout.write(r2.text+"\n\n")
    tempout.close()

    if ('link' in r2.headers and count > 0):
        links = r2.headers['link'].split(',')
        for L in links:
            ll = L.split(';')
            link = ll[0].replace("<","")
            link = link.replace(">","")
            if re.search(r'next', ll[1]):
                if (verbose): print("++ More link: " + link)
                #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
                #if (verbose): print("++ More link: " + link)

                nest = fetch(link,verbose,params,media)
                if isinstance(results,dict): results.update(nest)
                else: results.extend(nest)
    return results
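
# Illustrative sketch (not wired into the menu below): a typical call to fetch().
# Because fetch() follows the Link header's rel="next" pages itself, one call
# returns the complete, merged list. Course id 69 mirrors the commented-out
# test at the bottom of this file.
def example_fetch_course_users(course_id=69):
    return fetch('/api/v1/courses/%i/users?per_page=100' % course_id, verbose=1)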
|
|
|
|
# Main canvas querying fxn - stream version - don't die on big requests
def fetch_stream(target,verbose=0):
    # Loop over the paged results, yielding one page at a time instead of merging them.
    results = 0
    while target:
        if target[0:4] != "http": target = url + target
        if verbose:
            print("++ Fetching: " + target)
        r2 = requests.get(target, headers = header)
        if r2.status_code == 502:
            raise FetchError()
        count = 0
        try:
            results = json.loads(r2.text)
            count = len(results)
        except Exception:
            print("-- Failed to parse: ", r2.text)
        if verbose:
            print("Got %i results" % count)
        if verbose > 1:
            print(r2.headers)
        tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
        tempout.write(r2.text+"\n\n")
        tempout.close()

        next_link_found = 0
        if ('link' in r2.headers and count > 0):
            links = r2.headers['link'].split(',')
            for L in links:
                ll = L.split(';')
                link = ll[0].replace("<","")
                link = link.replace(">","")
                if re.search(r'next', ll[1]):
                    target = link
                    next_link_found = 1
                    break
        if not next_link_found: target = 0
        yield results
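
# Illustrative sketch (the account-courses endpoint here is an assumption, just a
# conveniently large listing): fetch_stream() is a generator, so each page can be
# processed and discarded instead of accumulating the whole result set in memory.
def example_stream_course_pages():
    for page in fetch_stream('/api/v1/accounts/%s/courses?per_page=100' % account_id, verbose=1):
        print("  got a page with %i records" % len(page))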
|
|
|
|
|
|
# For dicts with one key, collapse that one key out, because
# paging makes problems... example: enrollment_terms
def fetch_collapse(target,collapse='',verbose=0):
    # If there are more results, recursively call myself, adding on to the results.
    results = 0
    if target[0:4] != "http": target = url + target
    if verbose:
        print("++ Fetching: " + target)
    r2 = requests.get(target, headers = header)
    #if verbose:
    #    print("++ Got: " + r2.text)
    try:
        results = json.loads(r2.text)
    except Exception:
        print("-- Failed to parse: ", r2.text)
    if verbose: print(r2.headers)

    if collapse and collapse in results:
        results = results[collapse]

    if ('link' in r2.headers):
        links = r2.headers['link'].split(',')
        for L in links:
            ll = L.split(';')
            link = ll[0].replace("<","")
            link = link.replace(">","")
            if re.search(r'next', ll[1]):
                if (verbose): print("++ More link: " + link)
                nest = fetch_collapse(link, collapse, verbose)
                if isinstance(results,dict): results.update(nest)
                else: results.extend(nest)
    return results
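
# Illustrative sketch: enrollment terms are the motivating case in the comment
# above -- the endpoint wraps its list in a one-key dict ({"enrollment_terms": [...]}),
# so fetch_collapse() is asked to collapse that key while it pages.
def example_fetch_terms():
    return fetch_collapse('/api/v1/accounts/%s/terms' % account_id, collapse='enrollment_terms', verbose=1)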
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
################
|
|
################ CANVAS DATA
|
|
################
|
|
################
|
|
################
|
|
|
|
|
|
# Get canvas data 2024 style
|
|
def canvas_data_2024_run():
|
|
print("Updating all tables.")
|
|
asyncio.run(canvas_data_2024())
|
|
print("Done with all tables.")
|
|
|
|
|
|
async def canvas_data_2024():
|
|
|
|
base_url: str = os.environ["DAP_API_URL"]
|
|
client_id: str = os.environ["DAP_CLIENT_ID"]
|
|
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
|
|
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
|
|
|
|
# todo: use secrets
|
|
connection_string: str = "postgresql://postgres:rolley34@deep1/db"
|
|
|
|
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
|
|
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
|
|
|
|
async with DatabaseConnection(connection_string).open() as db_connection:
|
|
async with DAPClient(base_url, credentials) as session:
|
|
#tables = await session.get_tables("canvas")
|
|
for table in desired_tables:
|
|
print(f" trying to update {table} ")
|
|
try:
|
|
#await SQLReplicator(session, db_connection).initialize("canvas", table)
|
|
await SQLReplicator(session, db_connection).synchronize("canvas", table)
|
|
except Exception as e:
|
|
print(f" - skipping {table} because {e}")
|
|
|
|
|
|
|
|
# Get canvas data 2024 style
|
|
def setup_canvas_data_2024_run():
|
|
print("Setting up all tables.")
|
|
asyncio.run(setup_canvas_data_2024())
|
|
print("Done with all tables.")
|
|
|
|
|
|
async def setup_canvas_data_2024():
|
|
|
|
base_url: str = os.environ["DAP_API_URL"]
|
|
client_id: str = os.environ["DAP_CLIENT_ID"]
|
|
client_secret: str = os.environ["DAP_CLIENT_SECRET"]
|
|
#connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db"
|
|
connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db"
|
|
|
|
desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',')
|
|
credentials = Credentials.create(client_id=client_id, client_secret=client_secret)
|
|
|
|
async with DatabaseConnection(connection_string).open() as db_connection:
|
|
async with DAPClient(base_url, credentials) as session:
|
|
#tables = await session.get_tables("canvas")
|
|
for table in desired_tables:
|
|
print(f" {table}")
|
|
try:
|
|
await SQLReplicator(session, db_connection).initialize("canvas", table)
|
|
except Exception as e:
|
|
print(f" - skipping {table} because {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
################
|
|
################ ROSTERS AND REGISTRATION
|
|
################
|
|
################
|
|
################
|
|
|
|
# todo: the pipeline is disorganized. Organize it to have
|
|
# a hope of taking all this to a higher level.
|
|
#
|
|
|
|
# todo: where does this belong in the pipeline? compare with recent_schedules()
|
|
|
|
|
|
|
|
# Take the generically named roster upload files, move them into a semester
# folder, and stamp them with the current date.
def move_to_folder(sem,year,folder,files):
    semester = year+sem
    semester_path = 'cache/rosters/%s' % semester
    now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
    print("+ Moving roster files to folder: %s" % semester_path)
    if not os.path.isdir(semester_path):
        print("+ Creating folder: %s" % semester_path)
        os.makedirs(semester_path)
    if 'courses.csv' in files:
        os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
    if 'enrollments.csv' in files:
        os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
    if 'users.csv' in files:
        os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
|
|
|
|
|
|
|
|
# Take raw upload (csv) files and make one big json out of them.
# This relates to enrollment files, not schedule.
def convert_roster_files(semester="",year="",folder=""):
    if not semester:
        semester = input("the semester? (ex: spring) ")
        folder = input("Folder? (ex 2020-02-25-14-58-20) ")
    uf = open('cache/rosters/users-'+folder+'.csv','r')
    cf = open('cache/rosters/courses-'+folder+'.csv','r')
    ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
    u = csv.DictReader(uf)
    c = csv.DictReader(cf)
    e = csv.DictReader(ef)
    uu = [i for i in u]
    cc = [i for i in c]
    ee = [i for i in e]
    uf.close()
    cf.close()
    ef.close()
    myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)

    if os.path.exists(myrosterfile):
        print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
        last_fileobj = open(myrosterfile,'r')
        last_file = json.load(last_fileobj)
        last_fileobj.close()

        info = last_file[3]
        last_date = info['date_filestring']

        print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
        try:
            os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
            print(' -- ok')
        except Exception as e:
            print(" ** Failed because I couldn't move the previous roster file: %s" % myrosterfile)
            print(e)
            myrosterfile = 'cache/rosters/new_roster_%s_%s.json' % (year, semester)

        #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
        #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')

    newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
    try:
        new_roster = codecs.open(myrosterfile,'w', 'utf-8')
        new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
        new_roster.close()
        print(" -- Wrote roster info to: %s." % myrosterfile)
    except Exception as e:
        print(" ** Failed because I couldn't write the new roster file: %s" % myrosterfile)
        print(" ** " + str(e))
|
|
|
|
|
|
|
|
|
|
|
|
def file_doesnt_exist(name, folder='cache/rosters'):
    # List the files already downloaded to the rosters folder (the callers in
    # fetch_current_rosters() save there), skipping directories and zero-size files.
    files = os.listdir(folder)
    files = [f for f in files if os.path.isfile(os.path.join(folder,f)) and os.path.getsize(os.path.join(folder,f)) > 0]

    if name in files:
        print( f" * file: {name} already exists. not downloading." )
    else:
        print( f" * file: {name} downloading." )

    # True if the file is not present, i.e. it still needs to be downloaded.
    return not (name in files)
|
|
|
|
|
|
# From instructure sftp site
def fetch_current_rosters():
    cnopts = pysftp.CnOpts()
    cnopts.hostkeys = None
    with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
        sftp.chdir('SIS')
        files = sftp.listdir()
        ff = open('cache/pipeline.log.txt','a')
        now = datetime.datetime.now()
        exact_time = now.strftime('%Y-%m-%d-%H-%M-%S')
        rounded_hour = (now.replace(second=0, microsecond=0, minute=0, hour=now.hour)
                        + timedelta(hours=now.minute//30))
        rounded_time = rounded_hour.strftime('%Y-%m-%d-%H')

        if len(files)>0: # and 'users.csv' in files:
            print(f"--> {exact_time}: I see these files at instructure ftp site:")
            [print(f"    - {f}") for f in files]
            i = 0
            seen_files = []
            check = ['login','users','courses','enrollments']

            for checking in check:
                try:
                    if f'{checking}.csv' in files and file_doesnt_exist(f'{checking}-{rounded_time}.csv'):
                        sftp.get(f'{checking}.csv',f'cache/rosters/{checking}-{rounded_time}.csv')
                        i += 1
                        seen_files.append(f'{checking}.csv')
                except Exception:
                    print(f' * {checking}.csv not present')
            print('    Saved %i data files in rosters folder.' % i)
            ff.write( f"    Saved {i} data files: {seen_files}\n")

            if i>2:
                if 'courses.csv' in seen_files:
                    courses = open(f'cache/rosters/courses-{rounded_time}.csv','r')
                    courses.readline()
                    a = courses.readline()
                    print(a)
                    courses.close()
                    parts = a.split(',')
                    year = parts[1][0:4]
                    ss = parts[1][4:6]
                    sem = {'30':'spring', '50':'summer', '70':'fall' }
                    this_sem = sem[ss]
                    print(f"  -> This semester is: {this_sem}, {year}" )
                    print(f"  -> Building data file... {rounded_time}")
                    convert_roster_files(this_sem,year,rounded_time)
                    print('  -> moving files...')
                    ff.write( f"    Moved files to folder: {this_sem} {year} {rounded_time}\n")
                    move_to_folder(this_sem,year,rounded_time,seen_files)
                else:
                    print("  * No courses file. Not moving files.")
                    ff.write( f"  * No courses file. Not moving files.\n")
        else:
            print(f"--> {exact_time}: Don't see files.")
        ff.close()
|
|
|
|
def fetch_current_rosters_auto():
|
|
fetch_minute = "56,57,58,59,00,01,02,03,04,05,06".split(",")
|
|
for m in fetch_minute:
|
|
schedule.every().hour.at(f":{m}").do(fetch_current_rosters)
|
|
|
|
#schedule.every().day.at("12:35").do(sync_non_interactive)
|
|
#schedule.every().day.at("21:00").do(sync_non_interactive)
|
|
|
|
|
|
#print(f"running every hour on the :{fetch_minute}\n")
|
|
while True:
|
|
try:
|
|
schedule.run_pending()
|
|
time.sleep(4)
|
|
except Exception as e:
|
|
import traceback
|
|
print(" ---- * * * Failed with: %s" % str(e))
|
|
ff = open('cache/pipeline.log.txt','a')
|
|
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
|
|
ff.write(traceback.format_exc()+"\n---------\n\n")
|
|
ff.close()
|
|
#schedule.CancelJob
|
|
time.sleep(1)
|
|
|
|
|
|
# Canvas data, download all new files
|
|
def sync_non_interactive():
|
|
resp = do_request('/api/account/self/file/sync')
|
|
mylog.write(json.dumps(resp, indent=4))
|
|
#mylog.close()
|
|
gotten = os.listdir(local_data_folder)
|
|
wanted = []
|
|
i = 0
|
|
for x in resp['files']:
|
|
filename = x['filename']
|
|
exi = "No "
|
|
if filename in gotten: exi = "Yes"
|
|
else: wanted.append(x)
|
|
|
|
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
|
|
i += 1
|
|
print("I will attempt to download %i files." % len(wanted))
|
|
|
|
#answer = input("Press enter to begin, or q to quit ")
|
|
#if not answer == '': return
|
|
|
|
good_count = 0
|
|
bad_count = 0
|
|
for W in wanted:
|
|
print("Downloading: " + W['filename'])
|
|
response = requests.request(method='GET', url=W['url'], stream=True)
|
|
if(response.status_code != 200):
|
|
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
|
|
(response.status_code, response.reason))
|
|
print('URL: ' + W['url'])
|
|
bad_count += 1
|
|
|
|
else:
|
|
#Use the downloaded data
|
|
with open(local_data_folder + W['filename'], 'wb') as fd:
|
|
for chunk in response.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
print("Success")
|
|
good_count += 1
|
|
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
|
|
|
|
|
|
## OLD STYLE CANVAS DATA
|
|
|
|
# Get something from Canvas Data
|
|
def do_request(path):
|
|
#Set up the request pieces
|
|
method = 'GET'
|
|
host = 'api.inshosteddata.com'
|
|
    apiTime = datetime.datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
|
|
apiContentType = 'application/json'
|
|
|
|
msgList = []
|
|
msgList.append(method)
|
|
msgList.append(host)
|
|
msgList.append(apiContentType)
|
|
msgList.append('')
|
|
msgList.append(path)
|
|
msgList.append('')
|
|
msgList.append(apiTime)
|
|
msgList.append(apiSecret)
|
|
|
|
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
|
|
|
|
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
|
|
sig = sig.decode('utf-8')
|
|
|
|
headers = {}
|
|
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
|
|
headers['Date'] = apiTime
|
|
headers['Content-type'] = apiContentType
|
|
|
|
|
|
#Submit the request/get a response
|
|
uri = "https://"+host+path
|
|
print (uri)
|
|
print (headers)
|
|
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
|
|
|
|
#Check to make sure the request was ok
|
|
if(response.status_code != 200):
|
|
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
|
|
else:
|
|
#Use the downloaded data
|
|
jsonData = response.json()
|
|
#print(json.dumps(jsonData, indent=4))
|
|
return jsonData
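

# Illustrative sketch: do_request() signs any GET path against the old hosted
# Canvas Data API. The sync path used by sync_non_interactive() above is one
# example; the dump-listing path below is another documented route on the same
# service (treat it as an example, not something this pipeline depends on).
def example_list_dumps():
    return do_request('/api/account/self/dump')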
|
|
|
|
|
|
|
|
|
|
|
|
|
|
################
|
|
################ SENDING DATA AWAY
|
|
################
|
|
################
|
|
################
|
|
|
|
# Upload a json file to www
|
|
def put_file(remotepath,localpath, localfile,prompt=1):
|
|
show_all = 0
|
|
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
|
cnopts = pysftp.CnOpts()
|
|
cnopts.hostkeys = None
|
|
|
|
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
|
|
#todo: these paths
|
|
#files = sftp.listdir()
|
|
#print(folder + "\tI see these files on remote: ", files, "\n")
|
|
sftp.chdir(remotepath)
|
|
files = sftp.listdir()
|
|
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
|
|
localf = os.listdir(localpath)
|
|
if show_all: print("I see these local: ", localf)
|
|
if prompt:
|
|
input('ready to upload')
|
|
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
|
|
sftp.close()
|
|
|
|
|
|
"""
|
|
# copy files and directories from local static, to remote static,
|
|
# preserving modification times on the files
|
|
for f in localf:
|
|
print("This local file: " + f + " ", end=' ')
|
|
if not f in files:
|
|
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
|
|
print("Uploaded.")
|
|
else:
|
|
print("Skipped.")
|
|
"""
|
|
|
|
"""if len(files)==3 and 'users.csv' in files:
|
|
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
|
|
sftp.get('users.csv','rosters/users-'+folder+'.csv')
|
|
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
|
|
print folder + '\tSaved three data files in rosters folder.'
|
|
|
|
courses = open('rosters/courses-'+folder+'.csv','r')
|
|
courses.readline()
|
|
a = courses.readline()
|
|
print a
|
|
courses.close()
|
|
parts = a.split(',')
|
|
year = parts[1][0:4]
|
|
ss = parts[1][4:6]
|
|
#print parts[1]
|
|
sem = {'30':'spring', '50':'summer', '70':'fall' }
|
|
this_sem = sem[ss]
|
|
#print this_sem, "", year
|
|
print folder + '\tbuilding data file...'
|
|
convert_roster_files(this_sem,year,folder)
|
|
print folder + '\tmoving files...'
|
|
move_to_folder(this_sem,year,folder)
|
|
else:
|
|
print folder + "\tDon't see all three files."""
|
|
|
|
|
|
|
|
################
|
|
################ GOOGLE DOCS
|
|
################
|
|
################
|
|
################
|
|
|
|
def sec(t): return "<h3>"+t+"</h3>\n"
|
|
def para(t): return "<p>"+t+"</p>\n"
|
|
def ul(t): return "<ul>"+t+"</ul>\n"
|
|
def li(t): return "<li>"+t+"</li>\n"
|
|
|
|
def question(t,bracket=1):
|
|
ret = ''
|
|
match = re.search( r'\[(.*)\]', t)
|
|
if match and bracket:
|
|
ret += "<a name='" + match.group(1) + "'></a>"
|
|
t = re.sub( r'\[.*\]','',t)
|
|
else:
|
|
parts = t.split(' ')
|
|
id = ''
|
|
for p in parts:
|
|
if re.search(r'[a-zA-Z]',p[0]): id += p[0]
|
|
ret += "<a name='%s'></a>" % id.lower()
|
|
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
|
|
|
|
def answer(t):
|
|
return t + '</div></div>\n'
|
|
|
|
def read_paragraph_element(element,type="NORMAL_TEXT"):
|
|
"""Returns the text in the given ParagraphElement.
|
|
|
|
Args:
|
|
element: a ParagraphElement from a Google Doc.
|
|
"""
|
|
text_run = element.get('textRun')
|
|
begin = ''
|
|
end = ''
|
|
if not text_run:
|
|
return ''
|
|
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
|
|
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
|
|
end = '</a>'
|
|
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
|
|
begin = '<strong>' + begin
|
|
end = end + '</strong>'
|
|
|
|
content = text_run.get('content')
|
|
content = re.sub(u'\u000b','<br />\n',content)
|
|
|
|
return begin + content + end
|
|
|
|
|
|
def get_doc(docid, bracket=1, verbose=0):
|
|
import pickle
|
|
import os.path
|
|
from googleapiclient.discovery import build
|
|
from google_auth_oauthlib.flow import InstalledAppFlow
|
|
from google.auth.transport.requests import Request
|
|
|
|
#ooout = open(fileout,'w')
|
|
|
|
# If modifying these scopes, delete the file token.pickle.
|
|
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
|
creds = None
|
|
# The file token.pickle stores the user's access and refresh tokens, and is
|
|
# created automatically when the authorization flow completes for the first
|
|
# time.
|
|
if os.path.exists('token.pickle'):
|
|
with open('token.pickle', 'rb') as token:
|
|
creds = pickle.load(token)
|
|
# If there are no (valid) credentials available, let the user log in.
|
|
if not creds or not creds.valid:
|
|
if creds and creds.expired and creds.refresh_token:
|
|
creds.refresh(Request())
|
|
else:
|
|
flow = InstalledAppFlow.from_client_secrets_file(
|
|
'credentials.json', SCOPES)
|
|
creds = flow.run_local_server(port=0)
|
|
# Save the credentials for the next run
|
|
with open('token.pickle', 'wb') as token:
|
|
pickle.dump(creds, token)
|
|
|
|
service = build('docs', 'v1', credentials=creds)
|
|
|
|
# Retrieve the documents contents from the Docs service.
|
|
document = service.documents().get(documentId=docid).execute()
|
|
if verbose: print(document)
|
|
|
|
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
|
|
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
|
|
if verbose: print('The title of the document is: {}'.format(document.get('title')))
|
|
doc_content = document.get('body').get('content')
|
|
if verbose: print(doc_content)
|
|
|
|
doc_objects = document.get('inlineObjects')
|
|
if verbose: print(doc_objects)
|
|
|
|
doc_lists = document.get('lists')
|
|
|
|
text = '<div class="acrd_grp" data-accordion-group="">'
|
|
last_type = ''
|
|
answer_text = ''
|
|
in_a_list = ''
|
|
|
|
img_count = 1
|
|
img_lookup = {}
|
|
img_heights = {}
|
|
img_widths = {}
|
|
|
|
if doc_objects:
|
|
for k,value in doc_objects.items():
|
|
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
|
if 'inlineObjectProperties' in value:
|
|
if 'embeddedObject' in value['inlineObjectProperties']:
|
|
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
|
|
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
|
|
print(k)
|
|
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
|
|
response = requests.get(uu, stream=True)
|
|
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
|
|
img_count += 1
|
|
|
|
img_lookup[k] = name
|
|
|
|
with open('cache/doc_images/'+name, 'wb') as out_file:
|
|
shutil.copyfileobj(response.raw, out_file)
|
|
print(uu)
|
|
print(response.headers)
|
|
print(name)
|
|
#input('x?')
|
|
del response
|
|
if 'size' in value['inlineObjectProperties']['embeddedObject']:
|
|
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
|
|
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
|
|
|
|
tempout.write('- - - - - - - -\n\n')
|
|
#for value in doc_lists:
|
|
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
|
|
|
tempout.write('- - - - - - - -\n\n')
|
|
list_stack = []
|
|
list_depth = 0
|
|
last_list_depth = 0
|
|
for value in doc_content:
|
|
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
|
|
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
|
|
|
|
# todo: x link, x bold, list, image.
|
|
tag_fxn = para
|
|
if 'paragraph' in value:
|
|
this_text = ''
|
|
|
|
if 'bullet' in value['paragraph']:
|
|
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
|
|
|
|
lid = value['paragraph']['bullet']['listId']
|
|
|
|
if not list_stack: # 1
|
|
list_stack.append(lid)
|
|
else:
|
|
if lid == list_stack[0]: # 2
|
|
pass
|
|
|
|
else:
|
|
if not lid in list_stack: # 3
|
|
list_stack.append(lid)
|
|
else: # 4
|
|
x = list_stack.pop()
|
|
while x != lid: list_stack.pop()
|
|
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
|
|
list_stack = []
|
|
|
|
list_depth = len(list_stack)
|
|
|
|
deeper = list_depth - last_list_depth
|
|
|
|
if deeper > 0:
|
|
answer_text += "<ul>" * deeper
|
|
elif deeper < 0:
|
|
deeper = -1 * deeper
|
|
answer_text += "</ul>" * deeper
|
|
|
|
if len(list_stack):
|
|
tag_fxn = li
|
|
|
|
elements = value.get('paragraph').get('elements')
|
|
|
|
# inlineObjectElement": {
|
|
# "inlineObjectId": "kix.ssseeu8j9cfx",
|
|
|
|
if 'paragraphStyle' in value.get('paragraph'):
|
|
style = value.get('paragraph').get('paragraphStyle')
|
|
#text += json.dumps(style, sort_keys=True, indent=4)
|
|
if 'namedStyleType' in style:
|
|
type = style['namedStyleType']
|
|
|
|
for elem in elements:
|
|
|
|
# text content
|
|
this_text += read_paragraph_element(elem,type)
|
|
|
|
# image content
|
|
if 'inlineObjectElement' in elem:
|
|
vpi = elem['inlineObjectElement']
|
|
if 'inlineObjectId' in vpi:
|
|
ii = vpi['inlineObjectId']
|
|
if ii in img_lookup:
|
|
img = img_lookup[ii]
|
|
h = img_heights[ii]
|
|
w = img_widths[ii]
|
|
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
|
|
|
|
|
|
|
|
if last_type=='NORMAL_TEXT' and type!=last_type:
|
|
text += answer(answer_text)
|
|
answer_text = ''
|
|
|
|
if type=='HEADING_2':
|
|
text += sec(this_text)
|
|
this_text = ''
|
|
elif type=='HEADING_3':
|
|
text += question(this_text,bracket)
|
|
this_text = ''
|
|
else:
|
|
answer_text += tag_fxn(this_text)
|
|
this_text = ''
|
|
last_type = type
|
|
last_list_depth = list_depth
|
|
|
|
elif 'table' in value:
|
|
            # The text in table cells is in nested Structural Elements, and tables may be nested.
|
|
text += "\nTABLE\n"
|
|
#table = value.get('table')
|
|
#for row in table.get('tableRows'):
|
|
# cells = row.get('tableCells')
|
|
# for cell in cells:
|
|
# text += read_strucutural_elements(cell.get('content'))
|
|
#elif 'tableOfContents' in value:
|
|
# # The text in the TOC is also in a Structural Element.
|
|
# toc = value.get('tableOfContents')
|
|
# text += read_strucutural_elements(toc.get('content'))
|
|
|
|
#else:
|
|
# print(json.dumps(value, sort_keys=True, indent=4))
|
|
|
|
text += answer(answer_text)
|
|
#text += '</div>'
|
|
#print(text)
|
|
return text
|
|
|
|
######### TRY #2 ######
|
|
|
|
|
|
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
|
|
text_run = element.get('textRun')
|
|
begin = ''
|
|
end = ''
|
|
if not text_run: return ''
|
|
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
|
|
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
|
|
end = '</a>'
|
|
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
|
|
begin = '<strong>' + begin
|
|
end = end + '</strong>'
|
|
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
|
|
begin = '<em>' + begin
|
|
end = end + '</em>'
|
|
content = text_run.get('content')
|
|
content = re.sub(u'\u000b','<br />\n',content)
|
|
return begin + content + end
|
|
|
|
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
|
|
def handle_icons(t):
|
|
text = t[7:].strip()
|
|
parts = text.split(", ")
|
|
return ('icons',parts)
|
|
|
|
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
|
|
def handle_tags(t):
|
|
text = t[6:].strip()
|
|
parts = text.split(", ")
|
|
return ('tags',parts)
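
# Illustrative examples of the two helpers above:
#   handle_tags("Tags: canvas, registration")  ->  ('tags', ['canvas', 'registration'])
#   handle_icons("Icons: gear, book")          ->  ('icons', ['gear', 'book'])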
|
|
|
|
def handle_question(t,bracket=1):
|
|
anchor = ''
|
|
match = re.search( r'\[(.*)\]', t)
|
|
if match and bracket:
|
|
anchor = match.group(1).lower()
|
|
t = re.sub( r'\[.*\]','',t)
|
|
else:
|
|
parts = t.split(' ')
|
|
for p in parts:
|
|
if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
|
|
return ('question', t, anchor)
|
|
|
|
def handle_answer(t):
|
|
return ('answer',t)
|
|
|
|
def handle_sec(t): return ('section',t)
|
|
def handle_para(t): return ('paragraph',t)
|
|
def handle_ul(t): return ('unorderdedlist',t)
|
|
def handle_li(t): return ('listitem',t)
|
|
|
|
|
|
|
|
img_count = 1
|
|
img_lookup = {}
|
|
img_heights = {}
|
|
img_widths = {}
|
|
|
|
|
|
def fetch_doc_image(k,value):
|
|
global img_count, img_lookup, img_heights, img_widths
|
|
if 'inlineObjectProperties' in value:
|
|
if 'embeddedObject' in value['inlineObjectProperties']:
|
|
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
|
|
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
|
|
print(k)
|
|
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
|
|
response = requests.get(uu, stream=True)
|
|
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
|
|
img_count += 1
|
|
img_lookup[k] = name
|
|
|
|
with open('cache/doc_images/'+name, 'wb') as out_file:
|
|
shutil.copyfileobj(response.raw, out_file)
|
|
print(uu)
|
|
print(response.headers)
|
|
print(name)
|
|
del response
|
|
if 'size' in value['inlineObjectProperties']['embeddedObject']:
|
|
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
|
|
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
|
|
|
|
|
|
def get_doc_generic(docid, bracket=1, verbose=0):
|
|
import pickle
|
|
import os.path
|
|
from googleapiclient.discovery import build
|
|
from google_auth_oauthlib.flow import InstalledAppFlow
|
|
from google.auth.transport.requests import Request
|
|
global img_count, img_lookup, img_heights, img_widths
|
|
|
|
# If modifying these scopes, delete the file token.pickle.
|
|
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
|
creds = None
|
|
# The file token.pickle stores the user's access and refresh tokens, and is
|
|
# created automatically when the authorization flow completes for the first
|
|
# time.
|
|
if os.path.exists('token.pickle'):
|
|
with open('token.pickle', 'rb') as token:
|
|
creds = pickle.load(token)
|
|
if not creds or not creds.valid:
|
|
if creds and creds.expired and creds.refresh_token:
|
|
creds.refresh(Request())
|
|
else:
|
|
flow = InstalledAppFlow.from_client_secrets_file(
|
|
'credentials.json', SCOPES)
|
|
creds = flow.run_local_server(port=0)
|
|
# Save the credentials for the next run
|
|
with open('token.pickle', 'wb') as token:
|
|
pickle.dump(creds, token)
|
|
|
|
service = build('docs', 'v1', credentials=creds)
|
|
|
|
# Retrieve the documents contents from the Docs service.
|
|
document = service.documents().get(documentId=docid).execute()
|
|
|
|
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
|
|
tempout.write( json.dumps(document,indent=2) \
|
|
+ "\n\n\n------------------------------------\n\n")
|
|
if verbose: print('The title of the document is: {}'.format(document.get('title')))
|
|
|
|
doc_content = document.get('body').get('content')
|
|
doc_objects = document.get('inlineObjects')
|
|
doc_lists = document.get('lists')
|
|
|
|
#text = ''
|
|
result = []
|
|
last_type = ''
|
|
#answer_text = ''
|
|
answer = []
|
|
in_a_list = ''
|
|
|
|
    # Get all the images (a doc with no inline images has no 'inlineObjects' key).
    if doc_objects:
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            fetch_doc_image(k,value)
|
|
|
|
list_stack = []
|
|
list_depth = 0
|
|
last_list_depth = 0
|
|
for value in doc_content:
|
|
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
|
|
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
|
|
|
|
tag_fxn = handle_para
|
|
if 'paragraph' in value:
|
|
this_text = ''
|
|
|
|
# First we deal with if we're in a list.
|
|
if 'bullet' in value['paragraph']:
|
|
# either we're (1)starting a new list, (2)in one (do nothing),
|
|
# (3)starting a nested one, or (4)finished a nested one.
|
|
lid = value['paragraph']['bullet']['listId']
|
|
if not list_stack: # 1
|
|
list_stack.append(lid)
|
|
else:
|
|
if not lid == list_stack[0]:
|
|
if not lid in list_stack: # 3
|
|
list_stack.append(lid)
|
|
else: # 4
|
|
x = list_stack.pop()
|
|
while x != lid: list_stack.pop()
|
|
elif len(list_stack) > 0:
|
|
# current para isn't a bullet but we still have a list open.
|
|
list_stack = []
|
|
|
|
|
|
list_depth = len(list_stack)
|
|
deeper = list_depth - last_list_depth
|
|
if deeper > 0:
|
|
answer.append("<ul>" * deeper)
|
|
elif deeper < 0:
|
|
deeper = -1 * deeper
|
|
answer.append("</ul>" * deeper)
|
|
if len(list_stack):
|
|
tag_fxn = handle_li
|
|
|
|
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
|
|
elements = value.get('paragraph').get('elements')
|
|
if 'paragraphStyle' in value.get('paragraph'):
|
|
style = value.get('paragraph').get('paragraphStyle')
|
|
if 'namedStyleType' in style:
|
|
type = style['namedStyleType']
|
|
|
|
# and FINALLY, the actual contents.
|
|
for elem in elements:
|
|
# text content
|
|
this_text += read_paragraph_element_2(elem,type)
|
|
|
|
# image content
|
|
if 'inlineObjectElement' in elem:
|
|
vpi = elem['inlineObjectElement']
|
|
if 'inlineObjectId' in vpi:
|
|
ii = vpi['inlineObjectId']
|
|
if ii in img_lookup:
|
|
img = img_lookup[ii]
|
|
h = img_heights[ii]
|
|
w = img_widths[ii]
|
|
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
|
|
|
|
|
|
# Now for something tricky. Call an appropriate handler, based on:
|
|
# (a) what is the paragraph style type?
|
|
# (b) is it different from the prev one?
|
|
|
|
if last_type=='NORMAL_TEXT' and type!=last_type:
|
|
if this_text.strip():
|
|
result.append(handle_answer(answer))
|
|
answer = []
|
|
#answer_text = ''
|
|
|
|
if type=='HEADING_2' and this_text.strip():
|
|
result.append( handle_sec(this_text) )
|
|
this_text = ''
|
|
elif type=='HEADING_3' and this_text.strip():
|
|
result.append(handle_question(this_text,bracket))
|
|
this_text = ''
|
|
else:
|
|
if this_text.lower().startswith('tags:'):
|
|
tag_fxn = handle_tags
|
|
if this_text.lower().startswith('icons:'):
|
|
tag_fxn = handle_icons
|
|
if this_text.strip():
|
|
answer.append(tag_fxn(this_text))
|
|
this_text = ''
|
|
last_type = type
|
|
last_list_depth = list_depth
|
|
|
|
elif 'table' in value:
|
|
pass
|
|
|
|
|
|
result.append(handle_answer(answer))
|
|
return json.dumps(result,indent=4)
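
# Illustrative sketch (the document id is supplied by the caller): the JSON
# string returned by get_doc_generic() decodes back to a list of tagged tuples --
# sections, questions, answers, tags, icons -- ready for a site generator.
def example_parse_faq_doc(docid):
    return json.loads(get_doc_generic(docid))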
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
print ('')
|
|
options = { 1: ['Fetch rosters on schedule',fetch_current_rosters_auto] ,
|
|
2: ['Get canvas data 2024 style', canvas_data_2024_run ],
|
|
3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
|
|
}
|
|
|
|
'''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
|
|
2: ['Fetch rosters',fetch_current_rosters] ,
|
|
3:
|
|
4: ['Compute how registration is filling up classes', schedule_filling] ,
|
|
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
|
|
6: ['Canvas data: interactive sync', interactive ],
|
|
7: ['Canvas data: automated sync', sync_non_interactive ],
|
|
8:
|
|
9:
|
|
16: ['Scrape schedule from ssb', scrape_schedule_multi ],
|
|
14: ['Generate latestart schedule', list_latestarts ],
|
|
15: ['Test ssb calls with python', scrape_schedule_py ],
|
|
10: ['schedule to db', scrape_for_db ],
|
|
11: ['clean argos draft schedule file', argos_data_from_cvc],
|
|
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
|
|
13: ['Parse deanza schedule', dza_sched ],
|
|
'''
|
|
|
|
|
|
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
|
|
resp = int(sys.argv[1])
|
|
print("\n\nPerforming: %s\n\n" % options[resp][0])
|
|
|
|
else:
|
|
print ('')
|
|
for key in options:
|
|
print(str(key) + '.\t' + options[key][0])
|
|
|
|
print('')
|
|
resp = input('Choose: ')
|
|
|
|
# Call the function in the options dict
|
|
options[ int(resp)][1]()
|
|
|
|
# Testing
|
|
|
|
#if __name__ == "__main__":
|
|
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
|
|
#print "These are the users: "
|
|
#print users
|
|
|
|
#getSemesterSchedule()
|
|
|
|
|
|
#get_doc()
|
|
#pass
|