canvasapp/pipelines.py


from time import strptime
from bs4 import BeautifulSoup as bs
from util import UnicodeDictReader
from datetime import datetime as dt
from dateutil import parser
import pandas as pd
import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
import pdb
from collections import defaultdict
from deepdiff import DeepDiff
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, GOO, GOO_PIN, token, url, domain, account_id, header, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key
"""
Everything to do with fetching data:
- From iLearn, via API token
- Current roster uploads from Instructure's SFTP site
- Raw logs and other files from the Canvas Data repo
- From SSB, using Firefox (Selenium) to scrape the schedule
And some subsequent processing:
- Raw roster files into a more compact JSON format
- Raw logs into something more useful
"""
verbose = False
users = {}
users_by_id = {}
# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
schedfile = 'temp.csv'
# Superseded semester settings, kept for reference (only the last block below is active):
# SEMESTER = 'Summer 2019';  short_sem = 'su19'; semester_begin = strptime('06/17', '%m/%d'); filename = 'su19_sched.json'
# SEMESTER = 'Summer 2020';  short_sem = 'su20'; semester_begin = strptime('06/15', '%m/%d'); filename = 'su20_sched.json'
# SEMESTER = 'Fall 2020';    short_sem = 'fa20'; semester_begin = strptime('08/24', '%m/%d'); filename = 'fa20_sched.json'
# SEMESTER = 'Spring 2021';  short_sem = 'sp21'; semester_begin = strptime('02/01', '%m/%d'); filename = 'sp21_sched.json'; filename_html = 'sp21_sched.html'
# Active semester configuration (scrape_schedule_multi() and scrape_for_db() override these at runtime):
SEMESTER = 'Summer 2021 (View only)'
short_sem = 'su21'
semester_begin = strptime('06/14', '%m/%d')
filename = 'su21_sched.json'
filename_html = 'su21_sched.html'
# Current or upcoming semester is first.
sems = ['su21', 'sp21', 'fa20', 'su20', 'sp20'] #, 'fa19'] # 'sp19']
sys.setrecursionlimit( 100000 )
local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
gp = {}
gp['ACCT'] = 'info'
gp['AE'] = 'skill'
gp['AH'] = 'well'
gp['AJ'] = 'skill'
gp['AMT'] = 'skill'
gp['ANTH'] = 'soc'
gp['APE'] = 'skill'
gp['ART'] = 'art'
gp['ASTR'] = 'stem'
gp['ATH'] = 'well'
gp['BIO'] = 'stem'
gp['BOT'] = 'info'
gp['BUS'] = 'info'
gp['CD'] = 'skill'
gp['CHEM'] = 'stem'
gp['CMGT'] = 'skill'
gp['CMUN'] = 'comm'
gp['COS'] = 'skill'
gp['CSIS'] = 'stem'
gp['CWE'] = 'skill'
gp['DM'] = 'info'
gp['ECOL'] = 'stem'
gp['ECON'] = 'info'
gp['ENGL'] = 'soc'
gp['ENGR'] = 'stem'
gp['ENVS'] = 'stem'
gp['ESL'] = 'comm'
gp['ETHN'] = 'comm'
gp['FRNH'] = 'comm'
gp['GEOG'] = 'stem'
gp['GEOL'] = 'stem'
gp['GUID'] = 'soc'
gp['HE'] = 'well'
gp['HIST'] = 'soc'
gp['HUM'] = 'soc'
gp['HVAC'] = 'skill'
gp['JFT'] = 'skill'
gp['JLE'] = 'skill'
gp['JOUR'] = 'comm'
gp['JPN'] = 'comm'
gp['KIN'] = 'well'
gp['LIB'] = 'comm'
gp['LIFE'] = 'well'
gp['MATH'] = 'stem'
gp['MCTV'] = 'art'
gp['MUS'] = 'art'
gp['PHIL'] = 'soc'
gp['PHYS'] = 'stem'
gp['POLS'] = 'soc'
gp['PSCI'] = 'stem'
gp['PSYC'] = 'soc'
gp['RE'] = 'skill'
gp['SJS'] = 'soc'
gp['SOC'] = 'soc'
gp['SPAN'] = 'comm'
gp['THEA'] = 'art'
gp['WELD'] = 'skill'
gp['WTRM'] = 'skill'
gp['MGMT'] = 'skill'
gp['MKTG'] = 'skill'
gp['HTM'] = 'skill'
dean = {}
dean['AH'] = 'et'
dean['HE'] = 'et'
dean['ATH'] = 'et'
dean['KIN'] = 'et'
dean['LIFE'] = 'et'
dean['AE'] = 'ss'
dean['APE'] = 'ss'
dean['ACCT'] = 'ss'
dean['AJ'] = 'ss'
dean['AMT'] = 'ss'
dean['HVAC'] = 'ss'
dean['JFT'] = 'ss'
dean['JLE'] = 'ss'
dean['RE'] = 'ss'
dean['WTRM'] = 'ss'
dean['WELD'] = 'ss'
dean['ANTH'] = 'nl'
dean['ART'] = 'nl'
dean['ASTR'] = 'jn'
dean['BIO'] = 'jn'
dean['BOT'] = 'ss'
dean['BUS'] = 'ss'
dean['CD'] = 'ss'
dean['CHEM'] = 'jn'
dean['CMGT'] = 'ss'
dean['CMUN'] = 'nl'
dean['COS'] = 'ss'
dean['CSIS'] = 'ss'
dean['CWE'] = 'ss'
dean['DM'] = 'ss'
dean['ECOL'] = 'jn'
dean['ECON'] = 'nl'
dean['ENGL'] = 'nl'
dean['ENGR'] = 'jn'
dean['ENVS'] = 'jn'
dean['ESL'] = 'ss'
dean['ETHN'] = 'nl'
dean['FRNH'] = 'nl'
dean['GEOG'] = 'jn'
dean['GEOL'] = 'jn'
dean['GUID'] = 'nl'
dean['HIST'] = 'nl'
dean['HUM'] = 'nl'
dean['JOUR'] = 'nl'
dean['JPN'] = 'nl'
dean['LIB'] = 'kn'
dean['MATH'] = 'jn'
dean['MCTV'] = 'nl'
dean['MGMT'] = 'ss'
dean['MKTG'] = 'ss'
dean['HTM'] = 'ss'
dean['MUS'] = 'nl'
dean['PHIL'] = 'nl'
dean['PHYS'] = 'jn'
dean['POLS'] = 'nl'
dean['PSCI'] = 'jn'
dean['PSYC'] = 'nl'
dean['SJS'] = 'nl'
dean['SOC'] = 'nl'
dean['SPAN'] = 'nl'
dean['THEA'] = 'nl'
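# Example (sketch): these lookup tables tag each section downstream, e.g.
#   gp['MATH']   -> 'stem'
#   dean['MATH'] -> 'jn'
# scrape_schedule_multi() and scrape_for_db() use them to add 'gp' and 'dean'
# columns to the schedule output; a department missing from either dict will
# raise a KeyError there.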
class FetchError(Exception):
pass
DEBUG = 0
def d(s,end=''):
global DEBUG
if end and DEBUG: print(s,end=end)
elif DEBUG: print(s)
################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################
# Main Canvas API querying function
def fetch(target,verbose=0):
# if there are more results, recursively call myself, adding on to the results.
results = 0
count = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
#link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
#if (verbose): print("++ More link: " + link)
nest = fetch(link,verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
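# Example usage (a sketch -- assumes `url`, `header`, and `account_id` from
# canvas_secrets point at your Canvas instance):
#   courses = fetch('/api/v1/accounts/%s/courses?per_page=100' % account_id, verbose=1)
#   for c in courses: print(c['id'], c['name'])
# Relative targets get the base `url` prepended; paginated responses are followed
# via the 'next' link and merged into one list (or dict) before returning.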
# Main Canvas querying function - streaming (generator) version - doesn't die on big requests
def fetch_stream(target,verbose=0):
# keep following the 'next' paging link, yielding one page of results at a time.
results = 0
count = 0
while target:
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
if r2.status_code == 502:
raise FetchError()
try:
results = json.loads(r2.text)
count = len(results)
except:
print("-- Failed to parse: ", r2.text)
if verbose:
print("Got %i results" % count)
if verbose > 1:
print(r2.headers)
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
tempout.write(r2.text+"\n\n")
tempout.close()
next_link_found = 0
if ('link' in r2.headers and count > 0):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
target = link
next_link_found = 1
break
if not next_link_found: target = 0
yield results
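# Example usage (sketch): fetch_stream() is a generator, so large collections can
# be processed one page at a time instead of building one huge list in memory:
#   for page in fetch_stream('/api/v1/accounts/%s/users?per_page=100' % account_id):
#       for u in page: print(u['id'])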
# For responses wrapped in a single key, collapse that key out, because
# paging makes problems otherwise... example: enrollment_terms
def fetch_collapse(target,collapse='',verbose=0):
# if there are more results, recursively call myself, adding on to the results.
results = 0
if target[0:4] != "http": target = url + target
if verbose:
print("++ Fetching: " + target)
r2 = requests.get(target, headers = header)
#if verbose:
#print "++ Got: " + r2.text
try:
results = json.loads(r2.text)
except:
print("-- Failed to parse: ", r2.text)
if verbose: print(r2.headers)
if collapse and collapse in results:
results = results[collapse]
if ('link' in r2.headers):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
if (verbose): print("++ More link: " + link)
nest = fetch_collapse(link, collapse, verbose)
if isinstance(results,dict): results.update(nest)
else: results.extend(nest)
return results
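# Example usage (sketch), for endpoints that wrap their list in a single key:
#   terms = fetch_collapse('/api/v1/accounts/%s/terms' % account_id, collapse='enrollment_terms')
# The terms endpoint returns {"enrollment_terms": [...]}; collapsing that key lets
# paging concatenate the lists correctly.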
################
################ SCHEDULE PARSING HELPERS
################
################
################
# Teacher name format changed. Remove commas and switch 'Last, First' to 'First Last'
def fix_t_name(str):
str = str.strip()
str = re.sub(r'\s+',' ',str)
parts = str.split(', ')
if len(parts)>1:
return parts[1].strip() + " " + parts[0].strip()
return str
# Separate dept and code
def split_class_dept(c):
return c.split(' ')[0]
def split_class_code(c):
num = c.split(' ')[1]
parts = re.match(r'(\d+)([a-zA-Z]+)',num)
#ret = "Got %s, " % c
if parts:
r = int(parts.group(1))
#print(ret + "returning %i." % r)
return r
#print(ret + "returning %s." % num)
return int(num)
def split_class_code_letter(c):
num = c.split(' ')[1]
parts = re.match(r'(\d+)([A-Za-z]+)',num)
if parts:
return parts.group(2)
return ''
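# Examples (deduced from the regex above):
#   split_class_dept('MATH 233A')        -> 'MATH'
#   split_class_code('MATH 233A')        -> 233
#   split_class_code_letter('MATH 233A') -> 'A'
#   split_class_code_letter('MATH 233')  -> ''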
# go from sp20 to 2020spring
def shortToLongSem(s):
parts = re.search(r'(\w\w)(\d\d)', s)
yr = parts.group(2)
season = parts.group(1)
seasons = {'sp':'spring','su':'summer','fa':'fall','wi':'winter'}
return '20'+yr+seasons[season]
# Go to the semesters folder and read the schedule. Return dataframe
def getSemesterSchedule(short='sp21'): # I used to be current_schedule
# todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again
filename = 'cache/semesters/'+shortToLongSem(short)+'/' + short + '_sched.json'
print("opening %s" % filename)
#openfile = open(filename,'r')
#a = json.loads(openfile)
#return pd.DataFrame(a)
schedule = pd.read_json(filename)
schedule.teacher = schedule['teacher'].apply(fix_t_name)
#print schedule['teacher']
for index,r in schedule.iterrows():
tch = r['teacher']
parts = tch.split(' . ')
if len(parts)>1:
#print "Multiple teachers: (" + tch + ")"
schedule.loc[index,'teacher'] = parts[0]
#print " Fixed original: ", schedule.loc[index]
for t in parts[1:]:
r['teacher'] = t
schedule.loc[-1] = r
#print " New row appended: ", schedule.loc[-1]
schedule = schedule.assign(dept = schedule['code'].apply(split_class_dept))
schedule = schedule.assign(codenum = schedule['code'].apply(split_class_code))
schedule = schedule.assign(codeletter = schedule['code'].apply(split_class_code_letter))
#print(schedule)
schedule['sem'] = short
#print schedule.columns
return schedule
def get_enrlmts_for_user(user,enrollments):
#active enrollments
u_en = enrollments[ lambda x: (x['user_id'] == user) & (x['workflow']=='active') ]
return u_en[['type','course_id']]
################
################ CANVAS DATA
################
################
################
# Get something from Canvas Data
def do_request(path):
#Set up the request pieces
method = 'GET'
host = 'api.inshosteddata.com'
apiTime = dt.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
apiContentType = 'application/json'
msgList = []
msgList.append(method)
msgList.append(host)
msgList.append(apiContentType)
msgList.append('')
msgList.append(path)
msgList.append('')
msgList.append(apiTime)
msgList.append(apiSecret)
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
sig = sig.decode('utf-8')
headers = {}
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
headers['Date'] = apiTime
headers['Content-type'] = apiContentType
#Submit the request/get a response
uri = "https://"+host+path
print (uri)
print (headers)
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
#Check to make sure the request was ok
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
jsonData = response.json()
#print(json.dumps(jsonData, indent=4))
return jsonData
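# Example (sketch -- this is how sync_non_interactive() and interactive() call it):
#   resp = do_request('/api/account/self/file/sync')
#   # resp['files'] is the list of dump files, each with a 'filename' and a signed 'url'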
# Canvas data, download all new files
def sync_non_interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
gotten = os.listdir(local_data_folder)
wanted = []
i = 0
for x in resp['files']:
filename = x['filename']
exi = "No "
if filename in gotten: exi = "Yes"
else: wanted.append(x)
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
i += 1
print("I will attempt to download %i files." % len(wanted))
#answer = input("Press enter to begin, or q to quit ")
#if not answer == '': return
good_count = 0
bad_count = 0
for W in wanted:
print("Downloading: " + W['filename'])
response = requests.request(method='GET', url=W['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
(response.status_code, response.reason))
print('URL: ' + W['url'])
bad_count += 1
else:
#Use the downloaded data
with open(local_data_folder + W['filename'], 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
good_count += 1
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
# list files in canvas_data (online) and choose one or some to download.
def interactive():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
#mylog.close()
i = 0
gotten = os.listdir(local_data_folder)
for x in resp['files']:
print(str(i) + '.\t' + x['filename'])
i += 1
which = input("Which files to get? (separate with commas, or say 'all') ")
if which=='all':
which_a = list(range(i))  # all indices 0..i-1
else:
which_a = which.split(",")
for W in which_a:
this_i = int(W)
this_f = resp['files'][this_i]
filename = this_f['filename']
if filename in gotten: continue
print("Downloading: " + filename)
response = requests.request(method='GET', url=this_f['url'], stream=True)
if(response.status_code != 200):
print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
else:
#Use the downloaded data
with open(local_data_folder + filename, 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
"""if filename.split('.')[-1] == 'gz':
try:
plain_filename = 'canvas_data/' + ".".join(filename.split('.')[:-1])
pf = open(plain_filename,'w')
with gzip.open('canvas_data/' + filename , 'rb') as f:
pf.write(f.read())
except Exception as e:
print "Failed to ungizp. Probably too big: " + str(e)"""
###### SSB SCHEDULE
######
######
######
def todays_date_filename(): # helper
n = datetime.datetime.now()  # `datetime` is the module here, so qualify the class
return "reg_" + short_sem + "_" + n.strftime('%Y%m%d')
def nowAsStr(): # possible duplicate
#Get the current time, printed in the right format
currentTime = datetime.datetime.utcnow()
prettyTime = currentTime.strftime('%a, %d %b %Y %H:%M:%S GMT')
return prettyTime
def row_has_data(r): # helper
if r.find_all('th'):
return False
if len(r.find_all('td')) > 2:
return True
if re.search(r'Note:', r.get_text()):
return True
return False
def row_text(r): # helper
#global dbg
d("Row Txt Fxn gets: ")
arr = []
for t in r.find_all('td'):
if t.contents and len(t.contents) and t.contents[0].name == 'img':
arr.append("1")
d("img")
r_text = t.get_text()
arr.append(r_text)
if 'colspan' in t.attrs and t['colspan']=='2':
d('[colspan2]')
arr.append('')
d("\t"+r_text, end=" ")
d('')
if len(arr)==1 and re.search(r'Note:',arr[0]):
note_line = clean_funny( arr[0] )
note_line = re.sub(r'\n',' ', note_line)
note_line = re.sub(r'"','', note_line)
#note_line = re.sub(r',','\,', note_line)
return ',,,,,,,,,,,,,,,,,,"' + note_line + '"\n'
del arr[0]
arr[1] = clean_funny(arr[1])
arr[2] = clean_funny(arr[2])
if arr[1]: arr[1] = arr[1] + " " + arr[2]
del arr[2]
arr = [ re.sub(r'&nbsp;','',a) for a in arr]
arr = [ re.sub(',','. ',a) for a in arr]
arr = [ re.sub(r'\(P\)','',a) for a in arr]
arr = [ a.strip() for a in arr]
#del arr[-1]
r = ','.join(arr)+'\n'
r = re.sub('\n','',r)
r = re.sub('add to worksheet','',r)
d("Row Txt Fxn returns: " + r + "\n\n")
return r + '\n'
# Take banner's html and make a csv(?) file
def ssb_to_csv(src):
#out = codecs.open(schedfile,'w','utf-8')
output = 'crn,code,sec,cmp,cred,name,days,time,cap,act,rem,wl_cap,wl_act,wl_rem,teacher,date,loc,ztc,note\n'
b = bs(src, 'html.parser')
tab = b.find(class_="datadisplaytable")
if not tab:
print("hmm... didn't find a 'datadisplaytable' in this html: ")
#print(src)
return 0
rows = tab.find_all('tr')
drows = list(filter(row_has_data,rows))
for dd in drows:
t = row_text(dd)
output += t
return output
def clean_funny(str):
if str and str == '\xa0': return ''  # cell that is only a non-breaking space
return str
def clean_funny2(str):
if str and str == '\xa0': return ''
if str and str == ' ': return ''
return str
def clean_funny3(str):
return re.sub('\xa0','',str)
### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
def course_start(course):
#todo: use this to make a early/late/short field and store semester dates w/ other constants
start = dt(2019,1,28)
end = dt(2019,5,24)
# is it normal, early, late, winter?
li = course[0]
date = li[12]
if date=='01/28-05/24':
return 'Normal'
if date=='TBA':
return 'TBA'
if date=='01/02-01/25':
return 'Winter'
if date=='01/02-01/24':
return 'Winter'
ma = re.search( r'(\d+)\/(\d+)\-(\d+)\/(\d+)', date)
if ma:
# TODO do these years matter?
mystart = dt(2019, int(ma.group(1)), int(ma.group(2)))
if int(ma.group(1)) > 10: mystart = dt(2018, int(ma.group(1)), int(ma.group(2)))
myend = dt(2019, int(ma.group(3)), int(ma.group(4)))
length = myend - mystart
weeks = length.days / 7
if mystart != start:
if mystart < start:
#print 'Early Start ', str(weeks), " weeks ",
return 'Early start'
else:
#print 'Late Start ', str(weeks), " weeks ",
return 'Late start'
else:
if myend > end:
#print 'Long class ', str(weeks), " weeks ",
return 'Long term'
else:
#print 'Short term ', str(weeks), " weeks ",
return 'Short term'
#return ma.group(1) + '/' + ma.group(2) + " end: " + ma.group(3) + "/" + ma.group(4)
else:
return "Didn't match: " + date
def time_to_partofday(t):
#todo: account for multiple sites/rows
# 11:20 am-12:10 pm
mor = strptime('12:00 PM', '%I:%M %p')
mid = strptime( '2:00 PM', '%I:%M %p')
aft = strptime( '6:00 PM', '%I:%M %p')
if t == 'TBA':
return 'TBA'
t = t.upper()
parts = t.split('-')
try:
begin = strptime(parts[0], '%I:%M %p')
end = strptime(parts[1], '%I:%M %p')
if end > aft:
return "Evening"
if end > mid:
return "Afternoon"
if end > mor:
return "Midday"
return "Morning"
#return begin,end
except Exception as e:
#print 'problem parsing: ', t, " ",
return ""
# Deduce a 'site' field, based on room name and known offsite locations
def room_to_site(room,verbose=0):
#todo: account for multiple sites/rows
#todo: better way to store these offsite labels
othersites = 'AV,SBHS I-243,SBHS I-244,LOADCS,HOPEH,HOPEG,PLY,SAS,SBHS,LOHS,CHS,SBRAT,'.split(',')
# is it gilroy, mh, hol, other, online or hybrid?
site = 'Gilroy'
#if len(course[0]) > 13:
# room = course[0][13]
if room in othersites:
site = "Other"
if room == 'TBA':
site = 'TBA'
if room == 'AV':
site = 'San Martin Airport'
if re.search('MHG',room):
site = 'Morgan Hill'
if re.search('HOL',room):
site = 'Hollister'
if re.search('COY',room):
site = 'Coyote Valley'
if re.search('OFFSTE',room):
site = 'Other'
if re.search('ONLINE',room):
site = 'Online'
if verbose: print(room, '\t', end=' ')
return site
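# Examples:
#   room_to_site('MHG120')  -> 'Morgan Hill'
#   room_to_site('ONLINE')  -> 'Online'
#   room_to_site('AV')      -> 'San Martin Airport'
#   anything unrecognized   -> 'Gilroy'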
from io import StringIO
# take text lines and condense them to one dict per section
def to_section_list(input_text,verbose=0):
this_course = ''
#todo: no output files
#jout = codecs.open(filename, 'w', 'utf-8')
#input = csv.DictReader(open(schedfile,'r'))
#input = UnicodeDictReader(input_text.splitlines())
all_courses = []
try:
f = StringIO(input_text)
except:
print("ERROR with this input_text:")
print(input_text)
return all_courses  # can't parse anything without a readable buffer
reader = csv.reader(f, delimiter=',')
headers = next(reader)
for r in reader:
d = dict(list(zip(headers,r)))
#pdb.set_trace()
# clean funny unicode char in blank entries
r = {k: clean_funny2(v) for k,v in list(d.items()) }
if verbose: print("Cleaned: " + str(r))
if 'time' in r:
if r['time']=='TBA': r['time'] = ''
if r['time']: r['partofday'] = time_to_partofday(r['time'])
r['type'] = ''
if 'loc' in r:
if r['loc'] == 'ONLINE': r['type'] = 'online'
if r['loc'] == 'ONLINE' and r['time']: r['type'] = 'online live'
if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
if r['loc']: r['site'] = room_to_site(r['loc'],verbose)
if 'code' in r:
if re.search(r'ONLINE\sLIVE',r['code']):
r['type'] = 'online live'
elif re.search(r'ONLINE',r['code']):
r['type'] = 'online'
# does it have a crn? then it starts a new section; otherwise it continues the previous one
if r['crn']: # is a new course or a continuation?
if verbose: print(" it's a new section.")
if this_course:
if not this_course['extra']: this_course.pop('extra',None)
all_courses.append(this_course)
this_course = r
#print(r['name'])
this_course['extra'] = []
else:
# is a continuation line
if verbose: print(" additional meeting: " + str(r))
for k,v in list(r.items()):
if not v: r.pop(k,None)
# TODO: if extra line is different type?
#if this_course['type']=='online' and r['type'] != 'online': this_course['type'] = 'hybrid'
#elif this_course['type']!='online' and r['type'] == 'online': this_course['type'] = 'hybrid'
this_course['extra'].append(r)
if this_course:  # append the final section after the loop (otherwise the last course would be dropped)
if not this_course['extra']: this_course.pop('extra',None)
all_courses.append(this_course)
return all_courses
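# Typical chain (see scrape_schedule() below): raw SSB html -> csv text -> one dict per section:
#   sections = to_section_list(ssb_to_csv(html_text))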
# Schedule / course filling history
# csv headers: crn, code, teacher, datetime, cap, act, wlcap, wlact
# Log the history of enrollments per course during registration
def log_section_filling(current_sched_list):
rows = 'timestamp crn code teacher cap act wl_cap wl_act'.split(' ')
rows_j = 'crn code teacher cap act wl_cap wl_act'.split(' ')
print(rows_j)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
csv_fn = 'cache/reg_history_' + short_sem + '.csv'
with codecs.open(csv_fn,'a','utf-8') as f:
writer = csv.writer(f)
for S in current_sched_list:
#print(S)
items = [now,]
items.extend( S[X] for X in rows_j )
writer.writerow(items)
# Same as above, but compressed, act only
def log_section_filling2(current_sched_list):
now = datetime.datetime.now().strftime('%Y-%m-%dT%H')
todays_data = { int(S['crn']): S['act'] for S in current_sched_list }
#print(todays_data)
todays_df = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
todays_df = todays_df.rename_axis('crn')
#print(todays_df)
todays_df.to_csv('cache/reg_today_new.csv', index=True)
try:
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
print(myframe)
except:
fff = open('cache/reg_data_'+short_sem+'.csv','w')
fff.write('crn\n')
fff.close()
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
#myframe = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
#myframe = myframe.rename_axis('crn')
print("Creating new data file for this semester.")
new_df = myframe.join( todays_df, on='crn', how='outer' )
new_df = new_df.rename_axis('crn')
print(new_df)
reg_data_filename = 'reg_data_' + short_sem + '.csv'
new_df.to_csv('cache/' + reg_data_filename, index=False)
put_file('/home/public/schedule/', 'cache/', reg_data_filename, 0)
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
def scrape_schedule():
#url = "https://ssb.gavilan.edu/prod/twbkwbis.P_GenMenu?name=bmenu.P_StuMainMnu"
url = "https://ssb-prod.ec.gavilan.edu/PROD/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"
text = ''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
try:
driver = webdriver.Firefox()
driver.get(url)
driver.find_element_by_id("UserID").clear()
driver.find_element_by_id("UserID").send_keys(GOO)
driver.find_element_by_name("PIN").send_keys(GOO_PIN)
driver.find_element_by_name("loginform").submit()
driver.implicitly_wait(5)
print(driver.title)
driver.find_element_by_link_text("Students").click()
driver.implicitly_wait(5)
print(driver.title)
driver.find_element_by_link_text("Registration").click()
driver.implicitly_wait(5)
print(driver.title)
driver.find_element_by_link_text("Search for Classes").click()
driver.implicitly_wait(15)
print(driver.title)
dd = Select(driver.find_element_by_name("p_term"))
if (dd):
dd.select_by_visible_text(SEMESTER)
driver.find_element_by_xpath("/html/body/div/div[4]/form").submit()
driver.implicitly_wait(15)
print(driver.title)
driver.find_element_by_xpath("/html/body/div/div[4]/form/input[18]").click()
driver.implicitly_wait(10)
print(driver.title)
driver.find_element_by_name("SUB_BTN").click()
driver.implicitly_wait(40)
time.sleep(15)
driver.implicitly_wait(40)
print(driver.title)
text = driver.page_source
driver.quit()
except Exception as e:
print("Got an exception: ", e)
finally:
print("")
#driver.quit()
codecs.open('cache/' + filename_html,'w', 'utf-8').write(text)
#print(text)
as_list = ssb_to_csv(text)
#print(as_list)
as_dict = to_section_list(as_list)
jj = json.dumps(as_dict,indent=2)
# TODO
try:
ps = codecs.open('cache/'+filename,'r','utf-8')
prev_sched = json.loads(ps.read())
ps.close()
if 1: # sometimes I want to re-run this without affecting the logs.
log_section_filling(as_dict)
log_section_filling2(as_dict)
dd = DeepDiff(prev_sched, as_dict, ignore_order=True)
pretty_json = json.dumps( json.loads( dd.to_json() ), indent=2 )
codecs.open('cache/%s_sched_diff.json' % short_sem,'w','utf-8').write( pretty_json ) # dd.to_json() )
except Exception as e:
print(e)
print("Can't do diff?")
# Next, rename the prev sched_xxYY.json data file to have its date,
# make this new one, and then upload it to the website.
# Maybe even count the entries and do a little sanity checking
#
# print("Last modified: %s" % time.ctime(os.path.getmtime("test.txt")))
# print("Created: %s" % time.ctime(os.path.getctime("test.txt")))
try:
last_mod = time.ctime(os.path.getmtime('cache/' + filename))
import pathlib
prev_stat = pathlib.Path('cache/' + filename).stat()
mtime = dt.fromtimestamp(prev_stat.st_mtime)
print(mtime)
except:
print("Couldn't stat the previous schedule file.")
# fname = pathlib.Path('test.py')
# assert fname.exists(), f'No such file: {fname}' # check that the file exists
# print(fname.stat())
#
# os.stat_result(st_mode=33206, st_ino=5066549581564298, st_dev=573948050, st_nlink=1, st_uid=0, st_gid=0, st_size=413,
# st_atime=1523480272, st_mtime=1539787740, st_ctime=1523480272)
codecs.open('cache/' + filename, 'w', 'utf-8').write(jj)
put_file('/home/public/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
return as_dict
def dza_sched():
text = codecs.open('cache/sched_fa22_deanza.html','r','utf-8').read()
as_list = ssb_to_csv(text)
#print(as_list)
as_dict = to_section_list(as_list)
jj = json.dumps(as_dict,indent=2)
codecs.open('cache/fa22_sched_deanza.json','w','utf-8').write(jj)
# recreate schedule json files with most current online schedule format.
def recent_schedules():
# # todo: sems is a global in this file. Is that the right thing to do?
#all_scheds = [ os.listdir( 'cache/rosters/' + shortToLongSem(s)) for s in sems ]
#for i,s in enumerate(sems):
for s in ['sp21',]:
filename = 'cache/sched_' + s + '.html'
print("Filename is %s" % filename)
input = codecs.open( filename, 'r', 'utf-8').read()
output = ssb_to_csv(input)
csv_fn = 'cache/temp_sched_' + s + '.csv'
if os.path.isfile(csv_fn):
os.remove(csv_fn)
codecs.open(csv_fn,'w','utf-8').write(output)
jsn = to_section_list(output)
jsn_fn = 'cache/semesters/'+shortToLongSem(s)+'/'+s+'_sched.json'
if os.path.isfile(jsn_fn):
os.remove(jsn_fn)
codecs.open(jsn_fn,'w').write(json.dumps(jsn))
print("I put the most recent schedule JSON files in ./cache/semesters/... folders.")
################
################ ROSTERS AND REGISTRATION
################
################
################
# todo: the pipeline is disorganized. Organize it to have
# a hope of taking all this to a higher level.
#
# todo: where does this belong in the pipeline? compare with recent_schedules()
# Take the generically named rosters uploads files and move them to a semester folder and give them a date.
def move_to_folder(sem,year,folder):
semester = year+sem
semester_path = 'cache/rosters/%s' % semester
if not os.path.isdir('cache/rosters/'+semester):
os.makedirs('cache/rosters/'+semester)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
print("+ Moving roster files to folder: %s" % semester_path)
if not os.path.isdir(semester_path):
print("+ Creating folder: %s" % semester_path)
os.makedirs(semester_path)
os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
# Take raw upload (csv) files and make one big json out of them.
# This relates to enrollment files, not schedule.
def convert_roster_files(semester="",year="",folder=""):
if not semester:
semester = input("the semester? (ex: spring) ")
year = input("the year? (ex: 2020) ")
folder = input("Folder? (ex 2020-02-25-14-58-20) ")
uf = open('cache/rosters/users-'+folder+'.csv','r')
cf = open('cache/rosters/courses-'+folder+'.csv','r')
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
u = csv.DictReader(uf)
c = csv.DictReader(cf)
e = csv.DictReader(ef)
uu = [i for i in u]
cc = [i for i in c]
ee = [i for i in e]
uf.close()
cf.close()
ef.close()
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
if os.path.exists(myrosterfile):
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
last_fileobj = open(myrosterfile,'r')
last_file = json.load(last_fileobj)
last_fileobj.close()
info = last_file[3]
last_date = info['date_filestring']
print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
try:
os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year,semester,last_date))
print(' -- ok')
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(e)
myrosterfile = "new_" + myrosterfile
pass
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
try:
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
new_roster.close()
print(" -- Wrote roster info to: %s." % myrosterfile)
except Exception as e:
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
print(" ** " + str(e))
# From instructure sftp site
def fetch_current_rosters():
dt_label = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
sftp.chdir('SIS')
files = sftp.listdir()
print("--> %s I see these files at instructure ftp site: " % dt_label )
[print(" %s" % f) for f in files]
i = 0
got_courses = 0
if len(files)>1: # and 'users.csv' in files:
try:
if 'users.csv' in files:
sftp.get('users.csv','cache/rosters/users-'+dt_label+'.csv')
i += 1
except:
print(' * users.csv not present')
try:
if 'courses.csv' in files:
sftp.get('courses.csv','cache/rosters/courses-'+dt_label+'.csv')
i += 1
got_courses = 1
except:
print(' * courses.csv not present')
try:
if 'enrollments.csv' in files:
sftp.get('enrollments.csv','cache/rosters/enrollments-'+dt_label+'.csv')
i += 1
except:
print(' * enrollments.csv not present')
print(' Saved %i data files in rosters folder.' % i)
if got_courses:
courses = open('cache/rosters/courses-%s.csv' % dt_label,'r')
courses.readline()
a = courses.readline()
print(a)
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
print(" -> This semester is: %s, %s" % (this_sem,year))
print(' -> %s building data file...' % dt_label)
convert_roster_files(this_sem,year,dt_label)
print(' -> moving files...')
move_to_folder(this_sem,year,dt_label)
else:
print(" * No courses file. Not moving files.")
else:
print("--> Don't see files.")
sftp.close()
def fetch_current_rosters_auto():
schedule.every().hour.at(":57").do(fetch_current_rosters)
schedule.every().day.at("12:35").do(sync_non_interactive)
schedule.every().day.at("21:00").do(sync_non_interactive)
print("running every hour on the :57\n")
while True:
try:
schedule.run_pending()
except Exception as e:
import traceback
print(" ---- * * * Failed with: %s" % str(e))
ff = open('cache/pipeline.log.txt','a')
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
ff.write(traceback.format_exc()+"\n---------\n\n")
ff.close()
#schedule.CancelJob
time.sleep(15)
# read schedule file with an eye toward watching what's filling up
def schedule_filling():
sem = 'spring2021' # todo: hardcoded
days = []
for f in sorted(os.listdir('cache/rosters/'+sem+'/')):
if f.endswith('.html'):
match = re.search(r'sched_(\d\d\d\d)_(\d\d)_(\d+)\.html',f)
if match:
print(f)
y = match.group(1)
m = match.group(2)
d = match.group(3)
print("Schedule from %s %s %s." % (y,m,d))
csv_sched = ssb_to_csv(open('cache/rosters/'+sem+'/'+f,'r').read())
jsn = to_section_list(csv_sched)
#print(json.dumps(jsn,indent=2))
days.append(jsn)
day1 = days[-2]
day2 = days[-1]
df = jsondiff.diff(day1, day2)
gains = defaultdict( list )
for D in df.keys():
if isinstance(D, int):
#print(day1[D]['code'] + '\t' + day1[D]['crn'] + ' Before: ' + day1[D]['act'] + ' After: ' + day2[D]['act'])
try:
gain = int(day2[D]['act']) - int(day1[D]['act'])
gains[gain].append( day1[D]['code'] + ' ' + day1[D]['crn'] )
except:
print("No gain for " + str(D))
#print("\t" + str(df[D]))
else:
print(D)
print(df[D])
for key, value in sorted(gains.items(), key=lambda x: x[0]):
print("{} : {}".format(key, value))
#print(json.dumps(gains,indent=2))
################
################ SENDING DATA AWAY
################
################
################
# Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1):
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
#todo: these paths
#files = sftp.listdir()
#print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath)
files = sftp.listdir()
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
if prompt:
input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
sftp.close()
"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
print("This local file: " + f + " ", end=' ')
if not f in files:
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
print("Uploaded.")
else:
print("Skipped.")
"""
"""if len(files)==3 and 'users.csv' in files:
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
sftp.get('users.csv','rosters/users-'+folder+'.csv')
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
print folder + '\tSaved three data files in rosters folder.'
courses = open('rosters/courses-'+folder+'.csv','r')
courses.readline()
a = courses.readline()
print a
courses.close()
parts = a.split(',')
year = parts[1][0:4]
ss = parts[1][4:6]
#print parts[1]
sem = {'30':'spring', '50':'summer', '70':'fall' }
this_sem = sem[ss]
#print this_sem, "", year
print folder + '\tbuilding data file...'
convert_roster_files(this_sem,year,folder)
print folder + '\tmoving files...'
move_to_folder(this_sem,year,folder)
else:
print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################
def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"
def question(t,bracket=1):
ret = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
ret += "<a name='" + match.group(1) + "'></a>"
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
id = ''
for p in parts:
if re.search(r'[a-zA-Z]',p[0]): id += p[0]
ret += "<a name='%s'></a>" % id.lower()
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
def answer(t):
return t + '</div></div>\n'
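# Sketch of the HTML these helpers emit (the question text here is just an example):
#   question('How do I drop a class? [drop]')
#     -> "<a name='drop'></a>" followed by an accordion header for the question
#   answer('<p>...</p>')
#     -> the answer body plus the closing </div></div> for the accordion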
def read_paragraph_element(element,type="NORMAL_TEXT"):
"""Returns the text in the given ParagraphElement.
Args:
element: a ParagraphElement from a Google Doc.
"""
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run:
return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
def get_doc(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
#ooout = open(fileout,'w')
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
if verbose: print(document)
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
if verbose: print(doc_content)
doc_objects = document.get('inlineObjects')
if verbose: print(doc_objects)
doc_lists = document.get('lists')
text = '<div class="acrd_grp" data-accordion-group="">'
last_type = ''
answer_text = ''
in_a_list = ''
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
if doc_objects:
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
#input('x?')
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
tempout.write('- - - - - - - -\n\n')
#for value in doc_lists:
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
tempout.write('- - - - - - - -\n\n')
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
# todo: x link, x bold, list, image.
tag_fxn = para
if 'paragraph' in value:
this_text = ''
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if lid == list_stack[0]: # 2
pass
else:
if not lid in list_stack: # 3
list_stack.append(lid)
else: # 4
x = list_stack.pop()
while x != lid: list_stack.pop()
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer_text += "<ul>" * deeper
elif deeper < 0:
deeper = -1 * deeper
answer_text += "</ul>" * deeper
if len(list_stack):
tag_fxn = li
elements = value.get('paragraph').get('elements')
# inlineObjectElement": {
# "inlineObjectId": "kix.ssseeu8j9cfx",
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
#text += json.dumps(style, sort_keys=True, indent=4)
if 'namedStyleType' in style:
type = style['namedStyleType']
for elem in elements:
# text content
this_text += read_paragraph_element(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
if last_type=='NORMAL_TEXT' and type!=last_type:
text += answer(answer_text)
answer_text = ''
if type=='HEADING_2':
text += sec(this_text)
this_text = ''
elif type=='HEADING_3':
text += question(this_text,bracket)
this_text = ''
else:
answer_text += tag_fxn(this_text)
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
# The text in table cells is in nested Structural Elements, and tables may be
# nested.
text += "\nTABLE\n"
#table = value.get('table')
#for row in table.get('tableRows'):
# cells = row.get('tableCells')
# for cell in cells:
# text += read_strucutural_elements(cell.get('content'))
#elif 'tableOfContents' in value:
# # The text in the TOC is also in a Structural Element.
# toc = value.get('tableOfContents')
# text += read_strucutural_elements(toc.get('content'))
#else:
# print(json.dumps(value, sort_keys=True, indent=4))
text += answer(answer_text)
#text += '</div>'
#print(text)
return text
######### TRY #2 ######
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
text_run = element.get('textRun')
begin = ''
end = ''
if not text_run: return ''
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
end = '</a>'
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
begin = '<strong>' + begin
end = end + '</strong>'
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
begin = '<em>' + begin
end = end + '</em>'
content = text_run.get('content')
content = re.sub(u'\u000b','<br />\n',content)
return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
text = t[7:].strip()
parts = text.split(", ")
return ('icons',parts)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
text = t[6:].strip()
parts = text.split(", ")
return ('tags',parts)
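# Examples (sketch -- the icon/tag names are just example values from the Google Doc text):
#   handle_icons('Icons: canvas, zoom') -> ('icons', ['canvas', 'zoom'])
#   handle_tags('Tags: registration')   -> ('tags', ['registration'])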
def handle_question(t,bracket=1):
anchor = ''
match = re.search( r'\[(.*)\]', t)
if match and bracket:
anchor = match.group(1).lower()
t = re.sub( r'\[.*\]','',t)
else:
parts = t.split(' ')
for p in parts:
if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
return ('question', t, anchor)
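# Example: handle_question('How do I log in? [login]') -> ('question', 'How do I log in? ', 'login')
# Without a [bracketed] id, the anchor falls back to the question's initials.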
def handle_answer(t):
return ('answer',t)
def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
global img_count, img_lookup, img_heights, img_widths
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
else:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
service = build('docs', 'v1', credentials=creds)
# Retrieve the documents contents from the Docs service.
document = service.documents().get(documentId=docid).execute()
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
tempout.write( json.dumps(document,indent=2) \
+ "\n\n\n------------------------------------\n\n")
if verbose: print('The title of the document is: {}'.format(document.get('title')))
doc_content = document.get('body').get('content')
doc_objects = document.get('inlineObjects')
doc_lists = document.get('lists')
#text = ''
result = []
last_type = ''
#answer_text = ''
answer = []
in_a_list = ''
# Get all the images
for k,value in doc_objects.items():
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
fetched = fetch_doc_image(k,value)
list_stack = []
list_depth = 0
last_list_depth = 0
for value in doc_content:
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
tag_fxn = handle_para
if 'paragraph' in value:
this_text = ''
# First we deal with if we're in a list.
if 'bullet' in value['paragraph']:
# either we're (1)starting a new list, (2)in one (do nothing),
# (3)starting a nested one, or (4)finished a nested one.
lid = value['paragraph']['bullet']['listId']
if not list_stack: # 1
list_stack.append(lid)
else:
if not lid == list_stack[0]:
if not lid in list_stack: # 3
list_stack.append(lid)
else: # 4
x = list_stack.pop()
while x != lid: list_stack.pop()
elif len(list_stack) > 0:
# current para isn't a bullet but we still have a list open.
list_stack = []
list_depth = len(list_stack)
deeper = list_depth - last_list_depth
if deeper > 0:
answer.append("<ul>" * deeper)
elif deeper < 0:
deeper = -1 * deeper
answer.append("</ul>" * deeper)
if len(list_stack):
tag_fxn = handle_li
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
elements = value.get('paragraph').get('elements')
if 'paragraphStyle' in value.get('paragraph'):
style = value.get('paragraph').get('paragraphStyle')
if 'namedStyleType' in style:
type = style['namedStyleType']
# and FINALLY, the actual contents.
for elem in elements:
# text content
this_text += read_paragraph_element_2(elem,type)
# image content
if 'inlineObjectElement' in elem:
vpi = elem['inlineObjectElement']
if 'inlineObjectId' in vpi:
ii = vpi['inlineObjectId']
if ii in img_lookup:
img = img_lookup[ii]
h = img_heights[ii]
w = img_widths[ii]
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
# Now for something tricky. Call an appropriate handler, based on:
# (a) what is the paragraph style type?
# (b) is it different from the prev one?
if last_type=='NORMAL_TEXT' and type!=last_type:
if this_text.strip():
result.append(handle_answer(answer))
answer = []
#answer_text = ''
if type=='HEADING_2' and this_text.strip():
result.append( handle_sec(this_text) )
this_text = ''
elif type=='HEADING_3' and this_text.strip():
result.append(handle_question(this_text,bracket))
this_text = ''
else:
if this_text.lower().startswith('tags:'):
tag_fxn = handle_tags
if this_text.lower().startswith('icons:'):
tag_fxn = handle_icons
if this_text.strip():
answer.append(tag_fxn(this_text))
this_text = ''
last_type = type
last_list_depth = list_depth
elif 'table' in value:
pass
result.append(handle_answer(answer))
return json.dumps(result,indent=4)
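# Hypothetical usage (DOC_ID stands in for a real Google Doc id, which lives elsewhere in the app):
#   blocks = json.loads(get_doc_generic(DOC_ID))
#   # blocks is a list of ('section'|'question'|'answer'|'tags'|...) tuples, serialized as JSON arrays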
def scrape_schedule_py():
return 1
"""
cur_session = requests.Session()
mygav_url = "https://lum-prod.ec.gavilan.edu/"
r1 = requests.get(mygav_url)
login_url1 = "https://lum-prod.ec.gavilan.edu/c/portal/login"
login_url = "https://eis-prod.ec.gavilan.edu/authenticationendpoint/login.do?commonAuthCallerPath=%2Fsamlsso&forceAuth=false&passiveAuth=false&tenantDomain=carbon.super&sessionDataKey=57203341-6823-4511-b88e-4e104aa2fd71&relyingParty=LP5PROD_LuminisPortalEntity&type=samlsso&sp=Luminis+Portal+PROD&isSaaSApp=false&authenticators=BasicAuthenticator:LOCAL"
"""
def scrape_schedule_multi():
global SEMESTER, short_sem, semester_begin, filename, filename_html
SEMESTER = 'Spring 2023'
short_sem = 'sp23'
semester_begin = strptime('01/30', '%m/%d')
filename = 'sp23_sched.json'
filename_html = 'sp23_sched.html'
SEM = ['Fall 2022', 'Summer 2022 (View only)', 'Spring 2022 (View only)',
'Fall 2021 (View only)', 'Summer 2021 (View only)', 'Spring 2021 (View only)', 'Fall 2020 (View only)', 'Summer 2020 (View only)', 'Spring 2020 (View only)',
'Fall 2019 (View only)', 'Summer 2019 (View only)', 'Spring 2019 (View only)', 'Fall 2018 (View only)', 'Summer 2018 (View only)', 'Spring 2018 (View only)' ]
srt = 'fa22,su22,sp22,fa21,su21,sp21,fa20,su20,sp20,fa19,su19,sp19,fa18,su18,sp18'.split(',')
beg = ['08/22','06/13','01/31','08/23','06/14','02/01','08/24','06/15','01/27','08/26','06/17','01/28','08/27','06/18','01/29']
#for i in [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]:
#SEMESTER = SEM[i]
#short_sem = srt[i]
#semester_begin = strptime(beg[i], '%m/%d')
#filename = '%s_sched.json' % short_sem
#filename_html = '%s_sched.html' % short_sem
as_dict = scrape_schedule()
expanded = list_latestarts(short_sem)
fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")
ffcsv = codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8')
with ffcsv as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(fields)
for S in expanded:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
# S['act'] = S['cap']
if S['loc'] == "ONLINE LIVE": S['site'] = 'OnlineLive'
csvwriter.writerow( [ S[x] for x in fields ] )
put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)
def scrape_for_db():
global SEMESTER, gp, dean, short_sem, semester_begin, filename, filename_html
fields = 'sem,crn,dept,num,gp,dean,code,name,teacher,type,cap,act,loc,site,date,days,time,cred,ztc'.split(',')
"""
SEMESTER = 'Fall 2022'
short_sem = 'fa22'
semester_begin = strptime('08/22', '%m/%d')
filename = 'fa22_sched.json'
filename_html = 'fa22_sched.html'
as_dict = scrape_schedule()
fff = codecs.open('cache/%s_sched.sql' % filename, 'w', 'utf-8')
fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
for S in as_dict:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
str = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
print(str)
fff.write(str)
fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
fff.close()
"""
SEMESTER = 'Spring 2023 (View only)'
short_sem = 'sp23'
semester_begin = strptime('01/30', '%m/%d')
filename = 'sp23_sched.json'
filename_html = 'sp23_sched.html'
as_dict = scrape_schedule()
fff = codecs.open('cache/%s_sched.sql' % filename, 'w', 'utf-8')
fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
for S in as_dict:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
str = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
print(str)
fff.write(str)
fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
fff.close()
def argos_data():
global dean,gp
f2 = codecs.open('cache/enrollment_sp23.csv','w','utf-8')
writer = csv.writer(f2)
headers = 'gp dean dept num code crn name act site'.split(' ')
writer.writerow(headers)
f = codecs.open('cache/sched_draft_sp23.csv','r','utf-8')
reader = csv.reader(f, delimiter=',')
headers = next(reader)
for r in reader:
d = dict(list(zip(headers,r)))
print(d)
my_dean = dean[d['Subj']]
my_gp = gp[d['Subj']]
dept = d['Subj']
num = d['Crse No']
code = dept + " " + num
crn = d['CRN']
name = d['Course Title']
act = d['Open Seats']
campus = d['Campus']
session = d['Session']
if campus == "Off Campus": site = session
else: site = campus
print(site)
writer.writerow([my_gp,my_dean,dept,num,code,crn,name,act,site])
def expand_old_semesters():
#terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20,fa20,sp21,su21,fa21,sp22,su22,fa22'.split(',')
terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20'.split(',')  # the shorter list is the one currently in effect
terms.reverse()
for t in terms:
list_latestarts(t)
input('press return to continue.')
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
show_summary = 1
the_year = '20' + term[2:4]
print("year: ", the_year, " semester: ", term)
term_in = "cache/%s_sched.json" % term
term_out = "cache/%s_latestarts.txt" % term
expanded_out = "%s_sched_expanded.json" % term
print("Writing output to " + term_out)
infile = codecs.open(term_in, "r", "utf-8")
outfile = codecs.open(term_out, "w", "utf-8")
exoutfile = codecs.open('cache/' + expanded_out, "w", "utf-8")
expanded = []
sched = json.loads(infile.read())
#print sched
by_date = {}
if show_summary: print("course \t loc \t type \t time")
for C in sched:
if (not C['type']) and C['loc'] != 'ONLINE': # and C['time']:
C['type'] = 'in-person'
if show_summary: print("%s \t %s \t %s \t %s" % (C['code'],C['loc'],C['type'],C['time']))
if 'extra' in C:
if 'partofday' in C and ('type' in C['extra'][0]) and (C['extra'][0]['type'] == 'online') and C['loc'] != "ONLINE LIVE":
C['type'] = 'hybrid'
times = C['time'].split("-")
if len(times) > 1:
time_start = times[0]
time_end = times[1]
try:
startt = time.strptime(time_start,"%I:%M %p")
endt = time.strptime(time_end,"%I:%M %p")
min_start = startt.tm_min
min_end = endt.tm_min
if min_start == 0: min_start = "00"
else: min_start = str(min_start)
if min_end == 0: min_end = "00"
else: min_end = str(min_end)
C['time_start'] = "%i:%s" % (startt.tm_hour, min_start )
C['time_end'] = "%i:%s" % (endt.tm_hour, min_end )
if 0:
print("+ Parsed %s into %s and %s." % (C['time'], C['time_start'], C['time_end']))
except Exception as e:
print(e, "\n-- problem parsing time ", time_start, " or ", time_end)
else:
C['time_start'] = ''
C['time_end'] = ''
if re.search('TBA',C['date']):
C['start'] = ''
C['end'] = ''
C['doy'] = ''
expanded.append(C)
continue
parts = C['date'].split("-")
start = parts[0] + "/" + the_year
end = parts[1] + "/" + the_year
try:
startd = parser.parse(start)
endd = parser.parse(end)
C['start'] = "%i-%i" % (startd.month,startd.day)
C['end'] = "%i-%i" % (endd.month,endd.day)
C['doy'] = startd.timetuple().tm_yday
expanded.append(C)
except Exception as e:
print(e, "\n-- problem parsing ", start, " or ", end)
if not startd in by_date:
by_date[startd] = []
by_date[startd].append(C)
exoutfile.write( json.dumps(expanded,indent=2) )
exoutfile.close()
put_file('/home/public/schedule/', 'cache/', expanded_out, 0)
for X in sorted(by_date.keys()):
#print("Start: ", X)
if len(by_date[X]) < 200:
prettydate = X.strftime("%A, %B %d")
#print(prettydate + ": " + str(len(by_date[X])) + " courses")
outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
for Y in by_date[X]:
#print "\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher']
#print(Y)
#outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] +"\n")
outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
return expanded
if __name__ == "__main__":
print ('')
options = { 1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
2: ['Fetch rosters',fetch_current_rosters] ,
3: ['Fetch rosters AND canvas data automatically',fetch_current_rosters_auto] ,
4: ['Compute how registration is filling up classes', schedule_filling] ,
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
6: ['Canvas data: interactive sync', interactive ],
7: ['Canvas data: automated sync', sync_non_interactive ],
8: ['Scrape schedule from ssb', scrape_schedule_multi ],
9: ['Test ssb calls with python', scrape_schedule_py ],
10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
}
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
# Testing
#if __name__ == "__main__":
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
#print "These are the users: "
#print users
#getSemesterSchedule()
#get_doc()
#pass