from time import strptime
from bs4 import BeautifulSoup as bs
from util import UnicodeDictReader
from datetime import datetime as dt
from dateutil import parser
import pandas as pd

import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib, datetime
import pdb

from collections import defaultdict
from deepdiff import DeepDiff

from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, GOO, GOO_PIN, token, url, domain, account_id, header, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key

"""
|
||
Everything to do with fetching data,
|
||
- From iLearn, via token
|
||
- current roster uploads from instructures sftp site
|
||
- raw logs and other from canvas data repo
|
||
- from ssb, use firefox to scrape the schedule
|
||
|
||
|
||
And some subsequent processing:
|
||
- Raw roster files, into a more compact json format
|
||
- Raw logs into something more useful
|
||
"""
|
||
|
||
verbose = False
|
||
|
||
users = {}
|
||
users_by_id = {}
|
||
|
||
# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python

schedfile = 'temp.csv'


SEMESTER = 'Summer 2019'
short_sem = 'su19'
semester_begin = strptime('06/17', '%m/%d')
filename = 'su19_sched.json'

SEMESTER = 'Summer 2020'
short_sem = 'su20'
semester_begin = strptime('06/15', '%m/%d')
filename = 'su20_sched.json'

SEMESTER = 'Fall 2020'
short_sem = 'fa20'
semester_begin = strptime('08/24', '%m/%d')
filename = 'fa20_sched.json'

SEMESTER = 'Spring 2021'
short_sem = 'sp21'
semester_begin = strptime('02/01', '%m/%d')
filename = 'sp21_sched.json'
filename_html = 'sp21_sched.html'


SEMESTER = 'Summer 2021 (View only)'
short_sem = 'su21'
semester_begin = strptime('06/14', '%m/%d')
filename = 'su21_sched.json'
filename_html = 'su21_sched.html'


# Current or upcoming semester is first.
sems = ['su21', 'sp21', 'fa20', 'su20', 'sp20'] #, 'fa19'] # 'sp19']

sys.setrecursionlimit( 100000 )

local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')

gp = {
    'ACCT': 'info',  'AE': 'skill',   'AH': 'well',    'AJ': 'skill',   'AMT': 'skill',
    'ANTH': 'soc',   'APE': 'skill',  'ART': 'art',    'ASTR': 'stem',  'ATH': 'well',
    'BIO': 'stem',   'BOT': 'info',   'BUS': 'info',   'CD': 'skill',   'CHEM': 'stem',
    'CMGT': 'skill', 'CMUN': 'comm',  'COS': 'skill',  'CSIS': 'stem',  'CWE': 'skill',
    'DM': 'info',    'ECOL': 'stem',  'ECON': 'info',  'ENGL': 'soc',   'ENGR': 'stem',
    'ENVS': 'stem',  'ESL': 'comm',   'ETHN': 'comm',  'FRNH': 'comm',  'GEOG': 'stem',
    'GEOL': 'stem',  'GUID': 'soc',   'HE': 'well',    'HIST': 'soc',   'HUM': 'soc',
    'HVAC': 'skill', 'JFT': 'skill',  'JLE': 'skill',  'JOUR': 'comm',  'JPN': 'comm',
    'KIN': 'well',   'LIB': 'comm',   'LIFE': 'well',  'MATH': 'stem',  'MCTV': 'art',
    'MUS': 'art',    'PHIL': 'soc',   'PHYS': 'stem',  'POLS': 'soc',   'PSCI': 'stem',
    'PSYC': 'soc',   'RE': 'skill',   'SJS': 'soc',    'SOC': 'soc',    'SPAN': 'comm',
    'THEA': 'art',   'WELD': 'skill', 'WTRM': 'skill', 'MGMT': 'skill', 'MKTG': 'skill',
    'HTM': 'skill',
}

dean = {
    'AH': 'et',    'HE': 'et',    'ATH': 'et',   'KIN': 'et',   'LIFE': 'et',
    'AE': 'ss',    'APE': 'ss',   'ACCT': 'ss',  'AJ': 'ss',    'AMT': 'ss',
    'HVAC': 'ss',  'JFT': 'ss',   'JLE': 'ss',   'RE': 'ss',    'WTRM': 'ss',
    'WELD': 'ss',  'ANTH': 'nl',  'ART': 'nl',   'ASTR': 'jn',  'BIO': 'jn',
    'BOT': 'ss',   'BUS': 'ss',   'CD': 'ss',    'CHEM': 'jn',  'CMGT': 'ss',
    'CMUN': 'nl',  'COS': 'ss',   'CSIS': 'ss',  'CWE': 'ss',   'DM': 'ss',
    'ECOL': 'jn',  'ECON': 'nl',  'ENGL': 'nl',  'ENGR': 'jn',  'ENVS': 'jn',
    'ESL': 'ss',   'ETHN': 'nl',  'FRNH': 'nl',  'GEOG': 'jn',  'GEOL': 'jn',
    'GUID': 'nl',  'HIST': 'nl',  'HUM': 'nl',   'JOUR': 'nl',  'JPN': 'nl',
    'LIB': 'kn',   'MATH': 'jn',  'MCTV': 'nl',  'MGMT': 'ss',  'MKTG': 'ss',
    'HTM': 'ss',   'MUS': 'nl',   'PHIL': 'nl',  'PHYS': 'jn',  'POLS': 'nl',
    'PSCI': 'jn',  'PSYC': 'nl',  'SJS': 'nl',   'SOC': 'nl',   'SPAN': 'nl',
    'THEA': 'nl',
}


class FetchError(Exception):
    pass


DEBUG = 0

def d(s,end=''):
    global DEBUG
    if end and DEBUG: print(s,end=end)
    elif DEBUG: print(s)


################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################


# Main canvas querying fxn
def fetch(target,verbose=0):
    # if there are more results, recursively call myself, adding on to the results.
    results = 0
    count = 0
    if target[0:4] != "http": target = url + target
    if verbose:
        print("++ Fetching: " + target)
    r2 = requests.get(target, headers = header)
    #if verbose:
    #print "++ Got: " + r2.text
    try:
        results = json.loads(r2.text)
        count = len(results)
    except:
        print("-- Failed to parse: ", r2.text)
    if verbose:
        print("Got %i results" % count)
    if verbose > 1:
        print(r2.headers)

    tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
    tempout.write(r2.text+"\n\n")
    tempout.close()

    if ('link' in r2.headers and count > 0):
        links = r2.headers['link'].split(',')
        for L in links:
            ll = L.split(';')
            link = ll[0].replace("<","")
            link = link.replace(">","")
            if re.search(r'next', ll[1]):
                if (verbose): print("++ More link: " + link)
                #link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
                #if (verbose): print("++ More link: " + link)

                nest = fetch(link,verbose)
                if isinstance(results,dict): results.update(nest)
                else: results.extend(nest)
    return results

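# Usage sketch for fetch(): illustrative only. The endpoint is an example of a
# paginated Canvas API call; it assumes url/header/account_id in canvas_secrets
# point at a working Canvas instance.
def _example_fetch_account_courses(verbose=1):
    return fetch('/api/v1/accounts/%s/courses?per_page=100' % account_id, verbose)
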
# Main canvas querying fxn - stream version - don't die on big requests
def fetch_stream(target,verbose=0):
    # page through the results in a while loop, yielding one page at a time.
    results = 0
    while target:
        if target[0:4] != "http": target = url + target
        if verbose:
            print("++ Fetching: " + target)
        r2 = requests.get(target, headers = header)
        if r2.status_code == 502:
            raise FetchError()
        count = 0
        try:
            results = json.loads(r2.text)
            count = len(results)
        except:
            print("-- Failed to parse: ", r2.text)
        if verbose:
            print("Got %i results" % count)
        if verbose > 1:
            print(r2.headers)
        tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
        tempout.write(r2.text+"\n\n")
        tempout.close()

        next_link_found = 0
        if ('link' in r2.headers and count > 0):
            links = r2.headers['link'].split(',')
            for L in links:
                ll = L.split(';')
                link = ll[0].replace("<","")
                link = link.replace(">","")
                if re.search(r'next', ll[1]):
                    target = link
                    next_link_found = 1
                    break
        if not next_link_found: target = 0
        yield results

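# Usage sketch for fetch_stream(): it is a generator, so iterate to handle one
# page of results at a time instead of holding everything in memory at once.
# The enrollments endpoint here is illustrative.
def _example_stream_course_enrollments(course_id, verbose=0):
    rows = []
    for page in fetch_stream('/api/v1/courses/%s/enrollments?per_page=100' % course_id, verbose):
        rows.extend(page)
    return rows
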
# for dicts with one key, collapse that one key out, because
# paging makes problems... example: enrollment_terms
def fetch_collapse(target,collapse='',verbose=0):
    # if there are more results, recursively call myself, adding on to the results.
    results = 0
    if target[0:4] != "http": target = url + target
    if verbose:
        print("++ Fetching: " + target)
    r2 = requests.get(target, headers = header)
    #if verbose:
    #print "++ Got: " + r2.text
    try:
        results = json.loads(r2.text)
    except:
        print("-- Failed to parse: ", r2.text)
    if verbose: print(r2.headers)

    if collapse and collapse in results:
        results = results[collapse]

    if ('link' in r2.headers):
        links = r2.headers['link'].split(',')
        for L in links:
            ll = L.split(';')
            link = ll[0].replace("<","")
            link = link.replace(">","")
            if re.search(r'next', ll[1]):
                if (verbose): print("++ More link: " + link)
                nest = fetch_collapse(link, collapse, verbose)
                if isinstance(results,dict): results.update(nest)
                else: results.extend(nest)
    return results

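# Usage sketch for fetch_collapse(): the terms endpoint wraps its list in an
# "enrollment_terms" key (the example the comment above mentions), which breaks
# page merging unless that key is collapsed out. Illustrative only.
def _example_fetch_terms(verbose=0):
    return fetch_collapse('/api/v1/accounts/%s/terms' % account_id, 'enrollment_terms', verbose)
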
################
################ SCHEDULE PARSING HELPERS
################
################
################

# Teacher name format changed. Remove commas and switch first to last
def fix_t_name(str):
    str = str.strip()
    str = re.sub(r'\s+',' ',str)
    parts = str.split(', ')
    if len(parts)>1:
        return parts[1].strip() + " " + parts[0].strip()
    return str

# Separate dept and code
def split_class_dept(c):
    return c.split(' ')[0]

def split_class_code(c):
    num = c.split(' ')[1]
    parts = re.match(r'(\d+)([a-zA-Z]+)',num)
    #ret = "Got %s, " % c
    if parts:
        r = int(parts.group(1))
        #print(ret + "returning %i." % r)
        return r
    #print(ret + "returning %s." % num)
    return int(num)

def split_class_code_letter(c):
    num = c.split(' ')[1]
    parts = re.match(r'(\d+)([A-Za-z]+)',num)
    if parts:
        return parts.group(2)
    return ''

# go from sp20 to 2020spring
def shortToLongSem(s):
    parts = re.search(r'(\w\w)(\d\d)', s)
    yr = parts.group(2)
    season = parts.group(1)
    seasons = {'sp':'spring','su':'summer','fa':'fall','wi':'winter'}
    return '20'+yr+seasons[season]

# Go to the semesters folder and read the schedule. Return dataframe
def getSemesterSchedule(short='sp21'): # I used to be current_schedule
    # todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again

    filename = 'cache/semesters/'+shortToLongSem(short)+'/' + short + '_sched.json'
    print("opening %s" % filename)
    #openfile = open(filename,'r')
    #a = json.loads(openfile)
    #return pd.DataFrame(a)
    schedule = pd.read_json(filename)
    schedule.teacher = schedule['teacher'].apply(fix_t_name)
    #print schedule['teacher']
    for index,r in schedule.iterrows():
        tch = r['teacher']
        parts = tch.split(' . ')
        if len(parts)>1:
            #print "Multiple teachers: (" + tch + ")"
            schedule.loc[index,'teacher'] = parts[0]
            #print " Fixed original: ", schedule.loc[index]

            for t in parts[1:]:
                r['teacher'] = t
                schedule.loc[-1] = r
                #print " New row appended: ", schedule.loc[-1]
    schedule = schedule.assign(dept = schedule['code'].apply(split_class_dept))
    schedule = schedule.assign(codenum = schedule['code'].apply(split_class_code))
    schedule = schedule.assign(codeletter = schedule['code'].apply(split_class_code_letter))
    #print(schedule)
    schedule['sem'] = short
    #print schedule.columns
    return schedule

def get_enrlmts_for_user(user,enrollments):
    #active enrollments
    u_en = enrollments[ lambda x: (x['user_id'] == user) & (x['workflow']=='active') ]
    return u_en[['type','course_id']]

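# Usage sketch (illustrative): load one semester's combined schedule and filter it.
# Assumes the cache/semesters/<year><season>/<short>_sched.json file already exists
# and uses the current column layout.
def _example_dept_sections(short='sp21', dept='MATH'):
    sched = getSemesterSchedule(short)        # shortToLongSem('sp21') -> '2021spring'
    return sched[sched['dept'] == dept][['code', 'teacher']]
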
################
################ CANVAS DATA
################
################
################


# Get something from Canvas Data
def do_request(path):
    #Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = dt.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'

    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)

    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')

    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')

    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType

    #Submit the request/get a response
    uri = "https://"+host+path
    print (uri)
    print (headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)

    #Check to make sure the request was ok
    if(response.status_code != 200):
        print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
    else:
        #Use the downloaded data
        jsonData = response.json()
        #print(json.dumps(jsonData, indent=4))
        return jsonData

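# Usage sketch for do_request(): the file-sync listing is the only Canvas Data
# endpoint this module calls; the response handling below is illustrative.
def _example_list_canvas_data_files():
    resp = do_request('/api/account/self/file/sync')
    if resp:
        return [f['filename'] for f in resp['files']]
    return []
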
# Canvas data, download all new files
def sync_non_interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)

        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
        i += 1
    print("I will attempt to download %i files." % len(wanted))

    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return

    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if(response.status_code != 200):
            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
                (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1

        else:
            #Use the downloaded data
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))

# list files in canvas_data (online) and choose one or some to download.
def interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    i = 0
    gotten = os.listdir(local_data_folder)
    for x in resp['files']:
        print(str(i) + '.\t' + x['filename'])
        i += 1
    which = input("Which files to get? (separate with commas, or say 'all') ")
    if which=='all':
        which_a = list(range(i))    # every listed file
    else:
        which_a = which.split(",")
    for W in which_a:
        this_i = int(W)
        this_f = resp['files'][this_i]
        filename = this_f['filename']
        if filename in gotten: continue
        print("Downloading: " + filename)
        response = requests.request(method='GET', url=this_f['url'], stream=True)
        if(response.status_code != 200):
            print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
        else:
            #Use the downloaded data
            with open(local_data_folder + filename, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            """if filename.split('.')[-1] == 'gz':
                try:
                    plain_filename = 'canvas_data/' + ".".join(filename.split('.')[:-1])
                    pf = open(plain_filename,'w')
                    with gzip.open('canvas_data/' + filename , 'rb') as f:
                        pf.write(f.read())
                except Exception as e:
                    print "Failed to ungizp. Probably too big: " + str(e)"""



###### SSB SCHEDULE
######
######
######

def todays_date_filename(): # helper
    n = datetime.datetime.now()
    m = n.month
    if m < 10: m = "0"+str(m)
    d = n.day
    if d < 10: d = "0" + str(d)
    return "reg_" + short_sem + "_" + str(n.year) + str(m) + str(d)

def nowAsStr(): # possible duplicate
    #Get the current time, printed in the right format
    currentTime = datetime.datetime.utcnow()
    prettyTime = currentTime.strftime('%a, %d %b %Y %H:%M:%S GMT')
    return prettyTime


def row_has_data(r): # helper
    if r.find_all('th'):
        return False
    if len(r.find_all('td')) > 2:
        return True
    if re.search(r'Note\:', r.get_text()):
        return True
    return False

def row_text(r): # helper
    #global dbg

    d("Row Txt Fxn gets: ")
    arr = []
    for t in r.find_all('td'):
        if t.contents and len(t.contents) and t.contents[0].name == 'img':
            arr.append("1")
            d("img")
        r_text = t.get_text()
        arr.append(r_text)
        if 'colspan' in t.attrs and t['colspan']=='2':
            d('[colspan2]')
            arr.append('')
        d("\t"+r_text, end=" ")
    d('')

    if len(arr)==1 and re.search(r'Note\:',arr[0]):
        note_line = clean_funny( arr[0] )
        note_line = re.sub(r'\n',' ', note_line)
        note_line = re.sub(r'"','', note_line)
        #note_line = re.sub(r',','\,', note_line)
        return ',,,,,,,,,,,,,,,,,,"' + note_line + '"\n'
    del arr[0]
    arr[1] = clean_funny(arr[1])
    arr[2] = clean_funny(arr[2])
    if arr[1]: arr[1] = arr[1] + " " + arr[2]
    del arr[2]
    arr = [ re.sub('\xa0','',a) for a in arr]    # strip non-breaking spaces from the cells
    arr = [ re.sub(',','. ',a) for a in arr]
    arr = [ re.sub(r'\(P\)','',a) for a in arr]
    arr = [ a.strip() for a in arr]
    #del arr[-1]
    r = ','.join(arr)+'\n'
    r = re.sub('\n','',r)
    r = re.sub('add to worksheet','',r)
    d("Row Txt Fxn returns: " + r + "\n\n")

    return r + '\n'


# Take banner's html and make a csv(?) file
def ssb_to_csv(src):
    #out = codecs.open(schedfile,'w','utf-8')
    output = 'crn,code,sec,cmp,cred,name,days,time,cap,act,rem,wl_cap,wl_act,wl_rem,teacher,date,loc,ztc,note\n'
    b = bs(src, 'html.parser')
    tab = b.find(class_="datadisplaytable")
    if not tab:
        print("hmm... didn't find a 'datadisplaytable' in this html: ")
        #print(src)
        return 0
    rows = tab.find_all('tr')
    drows = list(filter(row_has_data,rows))
    for dd in drows:
        t = row_text(dd)
        output += t
    return output


def clean_funny(str):
    # the "funny" character is a non-breaking space
    if str and str == '\xa0': return ''
    return str

def clean_funny2(str):
    if str and str == '\xa0': return ''
    if str and str == ' ': return ''
    return str

def clean_funny3(str):
    return re.sub('\xa0','',str)

### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
def course_start(course):
    #todo: use this to make a early/late/short field and store semester dates w/ other constants

    start = datetime.datetime(2019,1,28)
    end = datetime.datetime(2019,5,24)

    # is it normal, early, late, winter?
    li = course[0]
    date = li[12]

    if date=='01/28-05/24':
        return 'Normal'
    if date=='TBA':
        return 'TBA'
    if date=='01/02-01/25':
        return 'Winter'
    if date=='01/02-01/24':
        return 'Winter'

    ma = re.search( r'(\d+)\/(\d+)\-(\d+)\/(\d+)', date)
    if ma:
        # TODO do these years matter?
        mystart = datetime.datetime(2019, int(ma.group(1)), int(ma.group(2)))
        if int(ma.group(1)) > 10: mystart = datetime.datetime(2018, int(ma.group(1)), int(ma.group(2)))
        myend = datetime.datetime(2019, int(ma.group(3)), int(ma.group(4)))
        length = myend - mystart
        weeks = length.days / 7

        if mystart != start:
            if mystart < start:
                #print 'Early Start ', str(weeks), " weeks ",
                return 'Early start'
            else:
                #print 'Late Start ', str(weeks), " weeks ",
                return 'Late start'
        else:
            if myend > end:
                #print 'Long class ', str(weeks), " weeks ",
                return 'Long term'
            else:
                #print 'Short term ', str(weeks), " weeks ",
                return 'Short term'
        #return ma.group(1) + '/' + ma.group(2) + " end: " + ma.group(3) + "/" + ma.group(4)
    else:
        return "Didn't match: " + date

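# Illustrative check of course_start()'s buckets (assumes the spring-2019 dates above).
# A row list needs at least 13 columns because the date range lives at index 12.
def _example_course_start_buckets():
    row = lambda date: [''] * 12 + [date]
    return { d: course_start([row(d)]) for d in ('01/28-05/24', '02/10-05/24', '01/28-03/22', 'TBA') }
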
def time_to_partofday(t):
    #todo: account for multiple sites/rows
    # 11:20 am-12:10 pm
    mor = strptime('12:00 PM', '%I:%M %p')
    mid = strptime( '2:00 PM', '%I:%M %p')
    aft = strptime( '6:00 PM', '%I:%M %p')
    if t == 'TBA':
        return 'TBA'
    t = t.upper()
    parts = t.split('-')
    try:
        begin = strptime(parts[0], '%I:%M %p')
        end = strptime(parts[1], '%I:%M %p')
        if end > aft:
            return "Evening"
        if end > mid:
            return "Afternoon"
        if end > mor:
            return "Midday"
        return "Morning"
        #return begin,end
    except Exception as e:
        #print 'problem parsing: ', t, " ",
        return ""

# Deduce a 'site' field, based on room name and known offsite locations
def room_to_site(room,verbose=0):
    #todo: account for multiple sites/rows
    #todo: better way to store these offsite labels
    othersites = 'AV,SBHS I-243,SBHS I-244,LOADCS,HOPEH,HOPEG,PLY,SAS,SBHS,LOHS,CHS,SBRAT,'.split(',')
    # is it gilroy, mh, hol, other, online or hybrid?
    site = 'Gilroy'
    #if len(course[0]) > 13:
    #    room = course[0][13]
    if room in othersites:
        site = "Other"
    if room == 'TBA':
        site = 'TBA'
    if room == 'AV':
        site = 'San Martin Airport'
    if re.search('MHG',room):
        site = 'Morgan Hill'
    if re.search('HOL',room):
        site = 'Hollister'
    if re.search('COY',room):
        site = 'Coyote Valley'
    if re.search('OFFSTE',room):
        site = 'Other'
    if re.search('ONLINE',room):
        site = 'Online'
    if verbose: print(room, '\t', end=' ')
    return site

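# Quick illustrations of the two helpers above (values follow directly from the rules):
#   time_to_partofday('11:20 am-12:10 pm') -> 'Midday'
#   time_to_partofday('6:00 pm-8:50 pm')   -> 'Evening'
#   room_to_site('MHG 113')                -> 'Morgan Hill'
#   room_to_site('ONLINE')                 -> 'Online'
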
from io import StringIO


# take text lines and condense them to one dict per section
def to_section_list(input_text,verbose=0):
    this_course = ''
    #todo: no output files
    #jout = codecs.open(filename, 'w', 'utf-8')
    #input = csv.DictReader(open(schedfile,'r'))
    #input = UnicodeDictReader(input_text.splitlines())
    all_courses = []

    try:
        f = StringIO(input_text)
    except:
        print("ERROR with this input_text:")
        print(input_text)
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
    for r in reader:
        d = dict(list(zip(headers,r)))
        #pdb.set_trace()
        # clean funny unicode char in blank entries
        r = {k: clean_funny2(v) for k,v in list(d.items()) }
        if verbose: print("Cleaned: " + str(r))

        if 'time' in r:
            if r['time']=='TBA': r['time'] = ''
            if r['time']: r['partofday'] = time_to_partofday(r['time'])

        r['type'] = ''

        if 'loc' in r:
            if r['loc'] == 'ONLINE': r['type'] = 'online'
            if r['loc'] == 'ONLINE' and r['time']: r['type'] = 'online live'
            if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
            if r['loc']: r['site'] = room_to_site(r['loc'],verbose)

        if 'code' in r:
            if re.search(r'ONLINE\sLIVE',r['code']):
                r['type'] = 'online live'
            elif re.search(r'ONLINE',r['code']):
                r['type'] = 'online'

        # does it have a section? it is the last course
        if r['crn']: # is a new course or a continuation?
            if verbose: print(" it's a new section.")
            if this_course:
                if not this_course['extra']: this_course.pop('extra',None)
                all_courses.append(this_course)
            this_course = r
            #print(r['name'])
            this_course['extra'] = []
        else:
            # is a continuation line
            if verbose: print(" additional meeting: " + str(r))
            for k,v in list(r.items()):
                if not v: r.pop(k,None)
            # TODO: if extra line is different type?
            #if this_course['type']=='online' and r['type'] != 'online': this_course['type'] = 'hybrid'
            #elif this_course['type']!='online' and r['type'] == 'online': this_course['type'] = 'hybrid'
            this_course['extra'].append(r)
    # don't drop the final section when the input ends
    if this_course:
        if not this_course['extra']: this_course.pop('extra',None)
        all_courses.append(this_course)
    return all_courses

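# Pipeline sketch: how the pieces above fit together for one saved schedule page.
# Mirrors what dza_sched()/recent_schedules() do further down; the paths here are illustrative.
def _example_html_to_schedule_json(html_path='cache/sched_sp21.html', out_path='cache/example_sched.json'):
    src = codecs.open(html_path, 'r', 'utf-8').read()
    as_csv = ssb_to_csv(src)                 # banner HTML -> csv text
    sections = to_section_list(as_csv)       # csv text -> one dict per section
    codecs.open(out_path, 'w', 'utf-8').write(json.dumps(sections, indent=2))
    return sections
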
# Schedule / course filling history
|
||
# csv headers: crn, code, teacher, datetime, cap, act, wlcap, wlact
|
||
# Log the history of enrollments per course during registration
|
||
def log_section_filling(current_sched_list):
|
||
rows = 'timestamp crn code teacher cap act wl_cap wl_act'.split(' ')
|
||
rows_j = 'crn code teacher cap act wl_cap wl_act'.split(' ')
|
||
print(rows_j)
|
||
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
|
||
csv_fn = 'cache/reg_history_' + short_sem + '.csv'
|
||
with codecs.open(csv_fn,'a','utf-8') as f:
|
||
writer = csv.writer(f)
|
||
for S in current_sched_list:
|
||
#print(S)
|
||
items = [now,]
|
||
[ items.append( S[X] ) for X in rows_j ]
|
||
writer.writerow(items)
|
||
|
||
# Same as above, but compressed, act only
|
||
def log_section_filling2(current_sched_list):
|
||
|
||
|
||
|
||
now = datetime.datetime.now().strftime('%Y-%m-%dT%H')
|
||
|
||
todays_data = { int(S['crn']): S['act'] for S in current_sched_list }
|
||
#print(todays_data)
|
||
|
||
todays_df = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
|
||
todays_df = todays_df.rename_axis('crn')
|
||
#print(todays_df)
|
||
todays_df.to_csv('cache/reg_today_new.csv', index=True)
|
||
|
||
try:
|
||
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
|
||
print(myframe)
|
||
except:
|
||
fff = open('cache/reg_data_'+short_sem+'.csv','w')
|
||
fff.write('crn\n')
|
||
fff.close()
|
||
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
|
||
#myframe = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
|
||
#myframe = myframe.rename_axis('crn')
|
||
print("Creating new data file for this semester.")
|
||
|
||
new_df = myframe.join( todays_df, on='crn', how='outer' )
|
||
new_df = new_df.rename_axis('crn')
|
||
print(new_df)
|
||
|
||
reg_data_filename = 'reg_data_' + short_sem + '.csv'
|
||
new_df.to_csv('cache/' + reg_data_filename, index=False)
|
||
put_file('/home/public/schedule/', 'cache/', reg_data_filename, 0)
|
||
|
||
|
||
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
|
||
def scrape_schedule():
|
||
|
||
#url = "https://ssb.gavilan.edu/prod/twbkwbis.P_GenMenu?name=bmenu.P_StuMainMnu"
|
||
url = "https://ssb-prod.ec.gavilan.edu/PROD/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"
|
||
|
||
|
||
text = ''
|
||
|
||
from selenium import webdriver
|
||
from selenium.webdriver.common.keys import Keys
|
||
from selenium.webdriver.support.ui import WebDriverWait, Select
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
|
||
try:
|
||
driver = webdriver.Firefox()
|
||
driver.get(url)
|
||
driver.find_element_by_id("UserID").clear()
|
||
driver.find_element_by_id("UserID").send_keys(GOO)
|
||
driver.find_element_by_name("PIN").send_keys(GOO_PIN)
|
||
driver.find_element_by_name("loginform").submit()
|
||
driver.implicitly_wait(5)
|
||
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_link_text("Students").click()
|
||
driver.implicitly_wait(5)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_link_text("Registration").click()
|
||
driver.implicitly_wait(5)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_link_text("Search for Classes").click()
|
||
driver.implicitly_wait(15)
|
||
print(driver.title)
|
||
|
||
dd = Select(driver.find_element_by_name("p_term"))
|
||
if (dd):
|
||
dd.select_by_visible_text(SEMESTER)
|
||
driver.find_element_by_xpath("/html/body/div/div[4]/form").submit()
|
||
driver.implicitly_wait(15)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_xpath("/html/body/div/div[4]/form/input[18]").click()
|
||
driver.implicitly_wait(10)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_name("SUB_BTN").click()
|
||
driver.implicitly_wait(40)
|
||
time.sleep(15)
|
||
driver.implicitly_wait(40)
|
||
print(driver.title)
|
||
text = driver.page_source
|
||
driver.quit()
|
||
|
||
except Exception as e:
|
||
print("Got an exception: ", e)
|
||
finally:
|
||
print("")
|
||
#driver.quit()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
codecs.open('cache/' + filename_html,'w', 'utf-8').write(text)
|
||
|
||
|
||
|
||
#print(text)
|
||
as_list = ssb_to_csv(text)
|
||
#print(as_list)
|
||
as_dict = to_section_list(as_list)
|
||
jj = json.dumps(as_dict,indent=2)
|
||
|
||
# TODO
|
||
try:
|
||
ps = codecs.open('cache/'+filename,'r','utf-8')
|
||
prev_sched = json.loads(ps.read())
|
||
ps.close()
|
||
|
||
if 1: # sometimes I want to re-run this without affecting the logs.
|
||
log_section_filling(as_dict)
|
||
log_section_filling2(as_dict)
|
||
|
||
dd = DeepDiff(prev_sched, as_dict, ignore_order=True)
|
||
pretty_json = json.dumps( json.loads( dd.to_json() ), indent=2 )
|
||
codecs.open('cache/%s_sched_diff.json' % short_sem,'w','utf-8').write( pretty_json ) # dd.to_json() )
|
||
|
||
except Exception as e:
|
||
print(e)
|
||
print("Can't do diff?")
|
||
|
||
# Next, rename the prev sched_xxYY.json data file to have its date,
|
||
# make this new one, and then upload it to the website.
|
||
# Maybe even count the entries and do a little sanity checking
|
||
#
|
||
# print("Last modified: %s" % time.ctime(os.path.getmtime("test.txt")))
|
||
# print("Created: %s" % time.ctime(os.path.getctime("test.txt")))
|
||
|
||
|
||
try:
|
||
last_mod = time.ctime(os.path.getmtime('cache/' + filename))
|
||
|
||
import pathlib
|
||
prev_stat = pathlib.Path('cache/' + filename).stat()
|
||
mtime = dt.fromtimestamp(prev_stat.st_mtime)
|
||
print(mtime)
|
||
except:
|
||
print("Couldn't Diff.")
|
||
# fname = pathlib.Path('test.py')
|
||
# assert fname.exists(), f'No such file: {fname}' # check that the file exists
|
||
# print(fname.stat())
|
||
#
|
||
# os.stat_result(st_mode=33206, st_ino=5066549581564298, st_dev=573948050, st_nlink=1, st_uid=0, st_gid=0, st_size=413,
|
||
# st_atime=1523480272, st_mtime=1539787740, st_ctime=1523480272)
|
||
|
||
|
||
|
||
codecs.open('cache/' + filename, 'w', 'utf-8').write(jj)
|
||
|
||
put_file('/home/public/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
|
||
|
||
return as_dict
|
||
|
||
def dza_sched():
|
||
text = codecs.open('cache/sched_fa22_deanza.html','r','utf-8').read()
|
||
as_list = ssb_to_csv(text)
|
||
#print(as_list)
|
||
as_dict = to_section_list(as_list)
|
||
jj = json.dumps(as_dict,indent=2)
|
||
codecs.open('cache/fa22_sched_deanza.json','w','utf-8').write(jj)
|
||
|
||
# recreate schedule json files with most current online schedule format.
|
||
def recent_schedules():
|
||
# # todo: sems is a global in this file. Is that the right thing to do?
|
||
#all_scheds = [ os.listdir( 'cache/rosters/' + shortToLongSem(s)) for s in sems ]
|
||
#for i,s in enumerate(sems):
|
||
for s in ['sp21',]:
|
||
filename = 'cache/sched_' + s + '.html'
|
||
print("Filename is %s" % filename)
|
||
input = codecs.open( filename, 'r', 'utf-8').read()
|
||
output = ssb_to_csv(input)
|
||
|
||
csv_fn = 'cache/temp_sched_' + s + '.csv'
|
||
if os.path.isfile(csv_fn):
|
||
os.remove(csv_fn)
|
||
|
||
codecs.open(csv_fn,'w','utf-8').write(output)
|
||
|
||
jsn = to_section_list(output)
|
||
jsn_fn = 'cache/semesters/'+shortToLongSem(s)+'/'+s+'_sched.json'
|
||
if os.path.isfile(jsn_fn):
|
||
os.remove(jsn_fn)
|
||
codecs.open(jsn_fn,'w').write(json.dumps(jsn))
|
||
print("I put the most recent schedule JSON files in ./cache/semesters/... folders.")
|
||
|
||
|
||
|
||
|
||
|
||
################
|
||
################ ROSTERS AND REGISTRATION
|
||
################
|
||
################
|
||
################
|
||
|
||
# todo: the pipeline is disorganized. Organize it to have
|
||
# a hope of taking all this to a higher level.
|
||
#
|
||
|
||
# todo: where does this belong in the pipeline? compare with recent_schedules()
|
||
|
||
|
||
|
||
# Take the generically named rosters uploads files and move them to a semester folder and give them a date.
|
||
def move_to_folder(sem,year,folder):
|
||
semester = year+sem
|
||
semester_path = 'cache/rosters/%s' % semester
|
||
if not os.path.isdir('cache/rosters/'+semester):
|
||
os.makedirs('cache/rosters/'+semester)
|
||
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
|
||
print("+ Moving roster files to folder: %s" % semester_path)
|
||
if not os.path.isdir(semester_path):
|
||
print("+ Creating folder: %s" % semester_path)
|
||
os.makedirs(semester_path)
|
||
os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
|
||
os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
|
||
os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
|
||
|
||
|
||
|
||
# Take raw upload (csv) files and make one big json out of them.
|
||
# This relates to enrollment files, not schedule.
|
||
def convert_roster_files(semester="",year="",folder=""):
|
||
if not semester:
|
||
semester = input("the semester? (ex: spring) ")
|
||
folder = input("Folder? (ex 2020-02-25-14-58-20) ")
|
||
uf = open('cache/rosters/users-'+folder+'.csv','r')
|
||
cf = open('cache/rosters/courses-'+folder+'.csv','r')
|
||
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
|
||
u = csv.DictReader(uf)
|
||
c = csv.DictReader(cf)
|
||
e = csv.DictReader(ef)
|
||
uu = [i for i in u]
|
||
cc = [i for i in c]
|
||
ee = [i for i in e]
|
||
uf.close()
|
||
cf.close()
|
||
ef.close()
|
||
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
|
||
|
||
if os.path.exists(myrosterfile):
|
||
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
|
||
last_fileobj = open(myrosterfile,'r')
|
||
last_file = json.load(last_fileobj)
|
||
|
||
last_fileobj.close()
|
||
|
||
info = last_file[3]
|
||
last_date = info['date_filestring']
|
||
|
||
print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
|
||
|
||
try:
|
||
os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
|
||
print(' -- ok')
|
||
except Exception as e:
|
||
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
|
||
print(e)
|
||
myrosterfile = "new_" + myrosterfile
|
||
pass
|
||
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
|
||
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
|
||
|
||
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
|
||
try:
|
||
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
|
||
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
|
||
new_roster.close()
|
||
print(" -- Wrote roster info to: %s." % myrosterfile)
|
||
except Exception as e:
|
||
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
|
||
print(" ** " + str(e))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# From instructure sftp site
|
||
def fetch_current_rosters():
|
||
dt_label = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
||
cnopts = pysftp.CnOpts()
|
||
cnopts.hostkeys = None
|
||
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
|
||
sftp.chdir('SIS')
|
||
files = sftp.listdir()
|
||
print("--> %s I see these files at instructure ftp site: " % dt_label )
|
||
[print(" %s" % f) for f in files]
|
||
i = 0
|
||
got_courses = 0
|
||
if len(files)>1: # and 'users.csv' in files:
|
||
try:
|
||
if 'users.csv' in files:
|
||
sftp.get('users.csv','cache/rosters/users-'+dt_label+'.csv')
|
||
i += 1
|
||
except:
|
||
print(' * users.csv not present')
|
||
try:
|
||
if 'courses.csv' in files:
|
||
sftp.get('courses.csv','cache/rosters/courses-'+dt_label+'.csv')
|
||
i += 1
|
||
got_courses = 1
|
||
except:
|
||
print(' * courses.csv not present')
|
||
try:
|
||
if 'enrollments.csv' in files:
|
||
sftp.get('enrollments.csv','cache/rosters/enrollments-'+dt_label+'.csv')
|
||
i += 1
|
||
except:
|
||
print(' * enrollments.csv not present')
|
||
print(' Saved %i data files in rosters folder.' % i)
|
||
|
||
if got_courses:
|
||
courses = open('cache/rosters/courses-%s.csv' % dt_label,'r')
|
||
courses.readline()
|
||
a = courses.readline()
|
||
print(a)
|
||
courses.close()
|
||
parts = a.split(',')
|
||
year = parts[1][0:4]
|
||
ss = parts[1][4:6]
|
||
#print parts[1]
|
||
sem = {'30':'spring', '50':'summer', '70':'fall' }
|
||
this_sem = sem[ss]
|
||
print(" -> This semester is: %s, %s" % (this_sem,year))
|
||
|
||
print(' -> %s building data file...' % dt_label)
|
||
convert_roster_files(this_sem,year,dt_label)
|
||
print(' -> moving files...')
|
||
move_to_folder(this_sem,year,dt_label)
|
||
else:
|
||
print(" * No courses file. Not moving files.")
|
||
else:
|
||
print("--> Don't see files.")
|
||
sftp.close()
|
||
|
||
def fetch_current_rosters_auto():
|
||
|
||
schedule.every().hour.at(":57").do(fetch_current_rosters)
|
||
|
||
schedule.every().day.at("12:35").do(sync_non_interactive)
|
||
schedule.every().day.at("21:00").do(sync_non_interactive)
|
||
|
||
|
||
print("running every hour on the :57\n")
|
||
while True:
|
||
try:
|
||
schedule.run_pending()
|
||
except Exception as e:
|
||
import traceback
|
||
print(" ---- * * * Failed with: %s" % str(e))
|
||
ff = open('cache/pipeline.log.txt','a')
|
||
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
|
||
ff.write(traceback.format_exc()+"\n---------\n\n")
|
||
ff.close()
|
||
#schedule.CancelJob
|
||
time.sleep(15)
|
||
|
||
|
||
|
||
|
||
# read schedule file with an eye toward watching what's filling up
|
||
def schedule_filling():
|
||
sem = 'spring2021' # todo: hardcoded
|
||
days = []
|
||
for f in sorted(os.listdir('cache/rosters/'+sem+'/')):
|
||
if f.endswith('.html'):
|
||
match = re.search(r'sched_(\d\d\d\d)_(\d\d)_(\d+)\.html',f)
|
||
if match:
|
||
print(f)
|
||
y = match.group(1)
|
||
m = match.group(2)
|
||
d = match.group(3)
|
||
print("Schedule from %s %s %s." % (y,m,d))
|
||
csv_sched = ssb_to_csv(open('cache/rosters/'+sem+'/'+f,'r').read())
|
||
jsn = to_section_list(csv_sched)
|
||
#print(json.dumps(jsn,indent=2))
|
||
days.append(jsn)
|
||
day1 = days[-2]
|
||
day2 = days[-1]
|
||
df = jsondiff.diff(day1, day2)
|
||
gains = defaultdict( list )
|
||
|
||
for D in df.keys():
|
||
if isinstance(D, int):
|
||
#print(day1[D]['code'] + '\t' + day1[D]['crn'] + ' Before: ' + day1[D]['act'] + ' After: ' + day2[D]['act'])
|
||
try:
|
||
gain = int(day2[D]['act']) - int(day1[D]['act'])
|
||
gains[gain].append( day1[D]['code'] + ' ' + day1[D]['crn'] )
|
||
except:
|
||
print("No gain for " + str(D))
|
||
#print("\t" + str(df[D]))
|
||
else:
|
||
print(D)
|
||
print(df[D])
|
||
for key, value in sorted(gains.items(), key=lambda x: x[0]):
|
||
print("{} : {}".format(key, value))
|
||
|
||
#print(json.dumps(gains,indent=2))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
################
|
||
################ SENDING DATA AWAY
|
||
################
|
||
################
|
||
################
|
||
|
||
# Upload a json file to www
|
||
def put_file(remotepath,localpath, localfile,prompt=1):
|
||
show_all = 0
|
||
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
||
cnopts = pysftp.CnOpts()
|
||
cnopts.hostkeys = None
|
||
|
||
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
|
||
#todo: these paths
|
||
#files = sftp.listdir()
|
||
#print(folder + "\tI see these files on remote: ", files, "\n")
|
||
sftp.chdir(remotepath)
|
||
files = sftp.listdir()
|
||
if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
|
||
localf = os.listdir(localpath)
|
||
if show_all: print("I see these local: ", localf)
|
||
if prompt:
|
||
input('ready to upload')
|
||
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
|
||
sftp.close()
|
||
|
||
|
||
"""
|
||
# copy files and directories from local static, to remote static,
|
||
# preserving modification times on the files
|
||
for f in localf:
|
||
print("This local file: " + f + " ", end=' ')
|
||
if not f in files:
|
||
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
|
||
print("Uploaded.")
|
||
else:
|
||
print("Skipped.")
|
||
"""
|
||
|
||
"""if len(files)==3 and 'users.csv' in files:
|
||
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
|
||
sftp.get('users.csv','rosters/users-'+folder+'.csv')
|
||
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
|
||
print folder + '\tSaved three data files in rosters folder.'
|
||
|
||
courses = open('rosters/courses-'+folder+'.csv','r')
|
||
courses.readline()
|
||
a = courses.readline()
|
||
print a
|
||
courses.close()
|
||
parts = a.split(',')
|
||
year = parts[1][0:4]
|
||
ss = parts[1][4:6]
|
||
#print parts[1]
|
||
sem = {'30':'spring', '50':'summer', '70':'fall' }
|
||
this_sem = sem[ss]
|
||
#print this_sem, "", year
|
||
print folder + '\tbuilding data file...'
|
||
convert_roster_files(this_sem,year,folder)
|
||
print folder + '\tmoving files...'
|
||
move_to_folder(this_sem,year,folder)
|
||
else:
|
||
print folder + "\tDon't see all three files."""
|
||
|
||
|
||
|
||
################
|
||
################ GOOGLE DOCS
|
||
################
|
||
################
|
||
################
|
||
|
||
def sec(t): return "<h3>"+t+"</h3>\n"
|
||
def para(t): return "<p>"+t+"</p>\n"
|
||
def ul(t): return "<ul>"+t+"</ul>\n"
|
||
def li(t): return "<li>"+t+"</li>\n"
|
||
|
||
def question(t,bracket=1):
|
||
ret = ''
|
||
match = re.search( r'\[(.*)\]', t)
|
||
if match and bracket:
|
||
ret += "<a name='" + match.group(1) + "'></a>"
|
||
t = re.sub( r'\[.*\]','',t)
|
||
else:
|
||
parts = t.split(' ')
|
||
id = ''
|
||
for p in parts:
|
||
if re.search(r'[a-zA-Z]',p[0]): id += p[0]
|
||
ret += "<a name='%s'></a>" % id.lower()
|
||
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
|
||
|
||
def answer(t):
|
||
return t + '</div></div>\n'
|
||
|
||
def read_paragraph_element(element,type="NORMAL_TEXT"):
|
||
"""Returns the text in the given ParagraphElement.
|
||
|
||
Args:
|
||
element: a ParagraphElement from a Google Doc.
|
||
"""
|
||
text_run = element.get('textRun')
|
||
begin = ''
|
||
end = ''
|
||
if not text_run:
|
||
return ''
|
||
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
|
||
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
|
||
end = '</a>'
|
||
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
|
||
begin = '<strong>' + begin
|
||
end = end + '</strong>'
|
||
|
||
content = text_run.get('content')
|
||
content = re.sub(u'\u000b','<br />\n',content)
|
||
|
||
return begin + content + end
|
||
|
||
|
||
def get_doc(docid, bracket=1, verbose=0):
|
||
import pickle
|
||
import os.path
|
||
from googleapiclient.discovery import build
|
||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||
from google.auth.transport.requests import Request
|
||
|
||
#ooout = open(fileout,'w')
|
||
|
||
# If modifying these scopes, delete the file token.pickle.
|
||
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
||
creds = None
|
||
# The file token.pickle stores the user's access and refresh tokens, and is
|
||
# created automatically when the authorization flow completes for the first
|
||
# time.
|
||
if os.path.exists('token.pickle'):
|
||
with open('token.pickle', 'rb') as token:
|
||
creds = pickle.load(token)
|
||
# If there are no (valid) credentials available, let the user log in.
|
||
if not creds or not creds.valid:
|
||
if creds and creds.expired and creds.refresh_token:
|
||
creds.refresh(Request())
|
||
else:
|
||
flow = InstalledAppFlow.from_client_secrets_file(
|
||
'credentials.json', SCOPES)
|
||
creds = flow.run_local_server(port=0)
|
||
# Save the credentials for the next run
|
||
with open('token.pickle', 'wb') as token:
|
||
pickle.dump(creds, token)
|
||
|
||
service = build('docs', 'v1', credentials=creds)
|
||
|
||
# Retrieve the documents contents from the Docs service.
|
||
document = service.documents().get(documentId=docid).execute()
|
||
if verbose: print(document)
|
||
|
||
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
|
||
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
|
||
if verbose: print('The title of the document is: {}'.format(document.get('title')))
|
||
doc_content = document.get('body').get('content')
|
||
if verbose: print(doc_content)
|
||
|
||
doc_objects = document.get('inlineObjects')
|
||
if verbose: print(doc_objects)
|
||
|
||
doc_lists = document.get('lists')
|
||
|
||
text = '<div class="acrd_grp" data-accordion-group="">'
|
||
last_type = ''
|
||
answer_text = ''
|
||
in_a_list = ''
|
||
|
||
img_count = 1
|
||
img_lookup = {}
|
||
img_heights = {}
|
||
img_widths = {}
|
||
|
||
if doc_objects:
|
||
for k,value in doc_objects.items():
|
||
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
||
if 'inlineObjectProperties' in value:
|
||
if 'embeddedObject' in value['inlineObjectProperties']:
|
||
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
|
||
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
|
||
print(k)
|
||
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
|
||
response = requests.get(uu, stream=True)
|
||
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
|
||
img_count += 1
|
||
|
||
img_lookup[k] = name
|
||
|
||
with open('cache/doc_images/'+name, 'wb') as out_file:
|
||
shutil.copyfileobj(response.raw, out_file)
|
||
print(uu)
|
||
print(response.headers)
|
||
print(name)
|
||
#input('x?')
|
||
del response
|
||
if 'size' in value['inlineObjectProperties']['embeddedObject']:
|
||
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
|
||
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
|
||
|
||
tempout.write('- - - - - - - -\n\n')
|
||
#for value in doc_lists:
|
||
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
||
|
||
tempout.write('- - - - - - - -\n\n')
|
||
list_stack = []
|
||
list_depth = 0
|
||
last_list_depth = 0
|
||
for value in doc_content:
|
||
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
|
||
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
|
||
|
||
# todo: x link, x bold, list, image.
|
||
tag_fxn = para
|
||
if 'paragraph' in value:
|
||
this_text = ''
|
||
|
||
if 'bullet' in value['paragraph']:
|
||
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
|
||
|
||
lid = value['paragraph']['bullet']['listId']
|
||
|
||
if not list_stack: # 1
|
||
list_stack.append(lid)
|
||
else:
|
||
if lid == list_stack[0]: # 2
|
||
pass
|
||
|
||
else:
|
||
if not lid in list_stack: # 3
|
||
list_stack.append(lid)
|
||
else: # 4
|
||
x = list_stack.pop()
|
||
while x != lid: list_stack.pop()
|
||
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
|
||
list_stack = []
|
||
|
||
list_depth = len(list_stack)
|
||
|
||
deeper = list_depth - last_list_depth
|
||
|
||
if deeper > 0:
|
||
answer_text += "<ul>" * deeper
|
||
elif deeper < 0:
|
||
deeper = -1 * deeper
|
||
answer_text += "</ul>" * deeper
|
||
|
||
if len(list_stack):
|
||
tag_fxn = li
|
||
|
||
elements = value.get('paragraph').get('elements')
|
||
|
||
# inlineObjectElement": {
|
||
# "inlineObjectId": "kix.ssseeu8j9cfx",
|
||
|
||
if 'paragraphStyle' in value.get('paragraph'):
|
||
style = value.get('paragraph').get('paragraphStyle')
|
||
#text += json.dumps(style, sort_keys=True, indent=4)
|
||
if 'namedStyleType' in style:
|
||
type = style['namedStyleType']
|
||
|
||
for elem in elements:
|
||
|
||
# text content
|
||
this_text += read_paragraph_element(elem,type)
|
||
|
||
# image content
|
||
if 'inlineObjectElement' in elem:
|
||
vpi = elem['inlineObjectElement']
|
||
if 'inlineObjectId' in vpi:
|
||
ii = vpi['inlineObjectId']
|
||
if ii in img_lookup:
|
||
img = img_lookup[ii]
|
||
h = img_heights[ii]
|
||
w = img_widths[ii]
|
||
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
|
||
|
||
|
||
|
||
if last_type=='NORMAL_TEXT' and type!=last_type:
|
||
text += answer(answer_text)
|
||
answer_text = ''
|
||
|
||
if type=='HEADING_2':
|
||
text += sec(this_text)
|
||
this_text = ''
|
||
elif type=='HEADING_3':
|
||
text += question(this_text,bracket)
|
||
this_text = ''
|
||
else:
|
||
answer_text += tag_fxn(this_text)
|
||
this_text = ''
|
||
last_type = type
|
||
last_list_depth = list_depth
|
||
|
||
elif 'table' in value:
|
||
# The text in table cells are in nested Structural Elements and tables may be
|
||
# nested.
|
||
text += "\nTABLE\n"
|
||
#table = value.get('table')
|
||
#for row in table.get('tableRows'):
|
||
# cells = row.get('tableCells')
|
||
# for cell in cells:
|
||
# text += read_strucutural_elements(cell.get('content'))
|
||
#elif 'tableOfContents' in value:
|
||
# # The text in the TOC is also in a Structural Element.
|
||
# toc = value.get('tableOfContents')
|
||
# text += read_strucutural_elements(toc.get('content'))
|
||
|
||
#else:
|
||
# print(json.dumps(value, sort_keys=True, indent=4))
|
||
|
||
text += answer(answer_text)
|
||
#text += '</div>'
|
||
#print(text)
|
||
return text
|
||
|
||
######### TRY #2 ######
|
||
|
||
|
||
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
|
||
text_run = element.get('textRun')
|
||
begin = ''
|
||
end = ''
|
||
if not text_run: return ''
|
||
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
|
||
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
|
||
end = '</a>'
|
||
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
|
||
begin = '<strong>' + begin
|
||
end = end + '</strong>'
|
||
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
|
||
begin = '<em>' + begin
|
||
end = end + '</em>'
|
||
content = text_run.get('content')
|
||
content = re.sub(u'\u000b','<br />\n',content)
|
||
return begin + content + end
|
||
|
||
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
|
||
def handle_icons(t):
|
||
text = t[7:].strip()
|
||
parts = text.split(", ")
|
||
return ('icons',parts)
|
||
|
||
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
|
||
def handle_tags(t):
|
||
text = t[6:].strip()
|
||
parts = text.split(", ")
|
||
return ('tags',parts)
|
||
|
||
def handle_question(t,bracket=1):
|
||
anchor = ''
|
||
match = re.search( r'\[(.*)\]', t)
|
||
if match and bracket:
|
||
anchor = match.group(1).lower()
|
||
t = re.sub( r'\[.*\]','',t)
|
||
else:
|
||
parts = t.split(' ')
|
||
for p in parts:
|
||
if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
|
||
return ('question', t, anchor)
|
||
|
||
def handle_answer(t):
|
||
return ('answer',t)
|
||
|
||
def handle_sec(t): return ('section',t)
|
||
def handle_para(t): return ('paragraph',t)
|
||
def handle_ul(t): return ('unorderdedlist',t)
|
||
def handle_li(t): return ('listitem',t)
|
||
|
||
|
||
|
||
img_count = 1
|
||
img_lookup = {}
|
||
img_heights = {}
|
||
img_widths = {}
|
||
|
||
|
||
def fetch_doc_image(k,value):
|
||
global img_count, img_lookup, img_heights, img_widths
|
||
if 'inlineObjectProperties' in value:
|
||
if 'embeddedObject' in value['inlineObjectProperties']:
|
||
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
|
||
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
|
||
print(k)
|
||
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
|
||
response = requests.get(uu, stream=True)
|
||
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
|
||
img_count += 1
|
||
img_lookup[k] = name
|
||
|
||
with open('cache/doc_images/'+name, 'wb') as out_file:
|
||
shutil.copyfileobj(response.raw, out_file)
|
||
print(uu)
|
||
print(response.headers)
|
||
print(name)
|
||
del response
|
||
if 'size' in value['inlineObjectProperties']['embeddedObject']:
|
||
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
|
||
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
|
||
|
||
|
||
def get_doc_generic(docid, bracket=1, verbose=0):
|
||
import pickle
|
||
import os.path
|
||
from googleapiclient.discovery import build
|
||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||
from google.auth.transport.requests import Request
|
||
global img_count, img_lookup, img_heights, img_widths
|
||
|
||
# If modifying these scopes, delete the file token.pickle.
|
||
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
||
creds = None
|
||
# The file token.pickle stores the user's access and refresh tokens, and is
|
||
# created automatically when the authorization flow completes for the first
|
||
# time.
|
||
if os.path.exists('token.pickle'):
|
||
with open('token.pickle', 'rb') as token:
|
||
creds = pickle.load(token)
|
||
if not creds or not creds.valid:
|
||
if creds and creds.expired and creds.refresh_token:
|
||
creds.refresh(Request())
|
||
else:
|
||
flow = InstalledAppFlow.from_client_secrets_file(
|
||
'credentials.json', SCOPES)
|
||
creds = flow.run_local_server(port=0)
|
||
# Save the credentials for the next run
|
||
with open('token.pickle', 'wb') as token:
|
||
pickle.dump(creds, token)
|
||
|
||
service = build('docs', 'v1', credentials=creds)
|
||
|
||
# Retrieve the documents contents from the Docs service.
|
||
document = service.documents().get(documentId=docid).execute()
|
||
|
||
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
|
||
tempout.write( json.dumps(document,indent=2) \
|
||
+ "\n\n\n------------------------------------\n\n")
|
||
if verbose: print('The title of the document is: {}'.format(document.get('title')))
|
||
|
||
doc_content = document.get('body').get('content')
|
||
doc_objects = document.get('inlineObjects')
|
||
doc_lists = document.get('lists')
|
||
|
||
#text = ''
|
||
result = []
|
||
last_type = ''
|
||
#answer_text = ''
|
||
answer = []
|
||
in_a_list = ''
|
||
|
||
# Get all the images
|
||
for k,value in doc_objects.items():
|
||
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
||
fetched = fetch_doc_image(k,value)
|
||
|
||
list_stack = []
|
||
list_depth = 0
|
||
last_list_depth = 0
|
||
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        tag_fxn = handle_para
        if 'paragraph' in value:
            this_text = ''

            # First, deal with whether we're in a list.
            if 'bullet' in value['paragraph']:
                # either we're (1) starting a new list, (2) in one (do nothing),
                # (3) starting a nested one, or (4) finished a nested one.
                lid = value['paragraph']['bullet']['listId']
                if not list_stack:  # 1
                    list_stack.append(lid)
                else:
                    if not lid == list_stack[0]:
                        if not lid in list_stack:  # 3
                            list_stack.append(lid)
                        else:  # 4
                            # pop back down to the list we returned to;
                            # reassign x each time so the loop terminates.
                            x = list_stack.pop()
                            while x != lid: x = list_stack.pop()
            elif len(list_stack) > 0:
                # current para isn't a bullet but we still have a list open.
                list_stack = []

            list_depth = len(list_stack)
            deeper = list_depth - last_list_depth
            if deeper > 0:
                answer.append("<ul>" * deeper)
            elif deeper < 0:
                deeper = -1 * deeper
                answer.append("</ul>" * deeper)
            if len(list_stack):
                tag_fxn = handle_li

            # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
            elements = value.get('paragraph').get('elements')
            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            # and FINALLY, the actual contents.
            for elem in elements:
                # text content
                this_text += read_paragraph_element_2(elem,type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)

            # Now for something tricky. Call an appropriate handler, based on:
            # (a) what is the paragraph style type?
            # (b) is it different from the prev one?

            if last_type=='NORMAL_TEXT' and type!=last_type:
                if this_text.strip():
                    result.append(handle_answer(answer))
                    answer = []
                    #answer_text = ''

            if type=='HEADING_2' and this_text.strip():
                result.append( handle_sec(this_text) )
                this_text = ''
            elif type=='HEADING_3' and this_text.strip():
                result.append(handle_question(this_text,bracket))
                this_text = ''
            else:
                if this_text.lower().startswith('tags:'):
                    tag_fxn = handle_tags
                if this_text.lower().startswith('icons:'):
                    tag_fxn = handle_icons
                if this_text.strip():
                    answer.append(tag_fxn(this_text))
                    this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            pass

    result.append(handle_answer(answer))
    return json.dumps(result,indent=4)

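
# Hedged usage sketch for get_doc_generic(): the doc id and output path below are
# placeholders (assumptions), not values used elsewhere in this script. It only shows
# the call shape -- the function returns a JSON string, so we write it straight to disk.
def example_dump_doc(docid='PLACEHOLDER_GOOGLE_DOC_ID',
                     outpath='cache/trash/doc_example.json'):
    data = get_doc_generic(docid, bracket=1, verbose=0)   # JSON string of sections/questions/answers
    with codecs.open(outpath, 'w', 'utf-8') as fh:
        fh.write(data)
    return outpath

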
def scrape_schedule_py():
    return 1

    """
    cur_session = requests.Session()
    mygav_url = "https://lum-prod.ec.gavilan.edu/"

    r1 = requests.get(mygav_url)

    login_url1 = "https://lum-prod.ec.gavilan.edu/c/portal/login"


    login_url = "https://eis-prod.ec.gavilan.edu/authenticationendpoint/login.do?commonAuthCallerPath=%2Fsamlsso&forceAuth=false&passiveAuth=false&tenantDomain=carbon.super&sessionDataKey=57203341-6823-4511-b88e-4e104aa2fd71&relyingParty=LP5PROD_LuminisPortalEntity&type=samlsso&sp=Luminis+Portal+PROD&isSaaSApp=false&authenticators=BasicAuthenticator:LOCAL"
    """


def scrape_schedule_multi():

    global SEMESTER, short_sem, semester_begin, filename, filename_html

    SEMESTER = 'Spring 2023'
    short_sem = 'sp23'
    semester_begin = strptime('01/30', '%m/%d')
    filename = 'sp23_sched.json'
    filename_html = 'sp23_sched.html'

    SEM = ['Fall 2022', 'Summer 2022 (View only)', 'Spring 2022 (View only)',
           'Fall 2021 (View only)', 'Summer 2021 (View only)', 'Spring 2021 (View only)', 'Fall 2020 (View only)', 'Summer 2020 (View only)', 'Spring 2020 (View only)',
           'Fall 2019 (View only)', 'Summer 2019 (View only)', 'Spring 2019 (View only)', 'Fall 2018 (View only)', 'Summer 2018 (View only)', 'Spring 2018 (View only)' ]

    srt = 'fa22,su22,sp22,fa21,su21,sp21,fa20,su20,sp20,fa19,su19,sp19,fa18,su18,sp18'.split(',')
    beg = ['08/22','06/13','01/31','08/23','06/14','02/01','08/24','06/15','01/27','08/26','06/17','01/28','08/27','06/18','01/29']

    #for i in [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]:
    #SEMESTER = SEM[i]
    #short_sem = srt[i]
    #semester_begin = strptime(beg[i], '%m/%d')
    #filename = '%s_sched.json' % short_sem
    #filename_html = '%s_sched.html' % short_sem

    as_dict = scrape_schedule()

    expanded = list_latestarts(short_sem)
    fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")

    ffcsv = codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8')
    with ffcsv as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)

        for S in expanded:
            parts = S['code'].split(' ')
            S['dept'] = parts[0]
            S['num'] = parts[1]
            S['gp'] = gp[parts[0]]
            S['dean'] = dean[parts[0]]
            S['sem'] = short_sem
            # S['act'] = S['cap']
            if S['loc'] == "ONLINE LIVE": S['site'] = 'OnlineLive'
            csvwriter.writerow( [ S[x] for x in fields ] )

    put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)

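
# Hedged sketch (assumption: the enrollment_<sem>.csv written above exists locally):
# read it back with csv.DictReader and total enrollment ('act') per dean, mainly to
# show the column layout the rest of the pipeline expects.
def example_enrollment_by_dean(sem='sp23'):
    totals = defaultdict(int)
    with codecs.open('cache/enrollment_%s.csv' % sem, 'r', 'utf-8') as fh:
        for row in csv.DictReader(fh):
            try:
                totals[row['dean']] += int(row['act'])
            except ValueError:
                pass   # some 'act' cells may be blank or non-numeric
    return dict(totals)

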
def scrape_for_db():

    global SEMESTER, gp, dean, short_sem, semester_begin, filename, filename_html
    fields = 'sem,crn,dept,num,gp,dean,code,name,teacher,type,cap,act,loc,site,date,days,time,cred,ztc'.split(',')

    """
    SEMESTER = 'Fall 2022'
    short_sem = 'fa22'
    semester_begin = strptime('08/22', '%m/%d')
    filename = 'fa22_sched.json'
    filename_html = 'fa22_sched.html'

    as_dict = scrape_schedule()
    fff = codecs.open('cache/%s_sched.sql' % filename, 'w', 'utf-8')
    fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
    for S in as_dict:
        parts = S['code'].split(' ')
        S['dept'] = parts[0]
        S['num'] = parts[1]
        S['gp'] = gp[parts[0]]
        S['dean'] = dean[parts[0]]
        S['sem'] = short_sem
        str = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
            ", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
        print(str)
        fff.write(str)
    fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
    fff.close()
    """

    SEMESTER = 'Spring 2023 (View only)'
    short_sem = 'sp23'
    semester_begin = strptime('01/30', '%m/%d')
    filename = 'sp23_sched.json'
    filename_html = 'sp23_sched.html'

    as_dict = scrape_schedule()
    fff = codecs.open('cache/%s_sched.sql' % filename, 'w', 'utf-8')
    fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
    for S in as_dict:
        parts = S['code'].split(' ')
        S['dept'] = parts[0]
        S['num'] = parts[1]
        S['gp'] = gp[parts[0]]
        S['dean'] = dean[parts[0]]
        S['sem'] = short_sem
        str = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
            ", ".join( [ "'" + re.sub(r"'", "", S[x]) + "'" for x in fields ] )
        print(str)
        fff.write(str)
    fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
    fff.close()

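
# Hedged sketch: load the generated .sql into a local SQLite file. This assumes the
# statements above are SQLite-compatible (the double-quoted literal in the UPDATE relies
# on SQLite's lenient quoting). Both paths are placeholders -- point sqlpath at whatever
# scrape_for_db() actually wrote.
def example_load_schedule_sql(sqlpath='cache/sp23_sched.sql',
                              dbpath='cache/schedule_example.db'):
    import sqlite3
    with codecs.open(sqlpath, 'r', 'utf-8') as fh:
        script = fh.read()
    conn = sqlite3.connect(dbpath)
    conn.executescript(script)   # runs CREATE TABLE, the INSERTs, and the UPDATE in one go
    conn.commit()
    conn.close()

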
def argos_data():
    global dean,gp

    f2 = codecs.open('cache/enrollment_argos_fa23.csv','w','utf-8')
    writer = csv.writer(f2)
    headers = 'gp dean dept num code crn name act site'.split(' ')
    writer.writerow(headers)

    f = codecs.open('cache/sched_draft_fa23.csv','r','utf-8')
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
    for r in reader:
        d = dict(list(zip(headers,r)))
        print(d)
        my_dean = dean[d['Subj']]
        my_gp = gp[d['Subj']]
        dept = d['Subj']
        num = d['Crse No']
        code = dept + " " + num
        crn = d['CRN']
        name = d['Course Title']
        act = d['Open Seats']
        campus = d['Campus']
        session = d['Session']
        if campus == "Off Campus": site = session
        else: site = campus
        print(site)
        writer.writerow([my_gp,my_dean,dept,num,code,crn,name,act,site])

def days_times(s):
    parts = re.search(r'^([MTWThRF]+)\s?(.*?)$',s)
    if parts:
        day = parts.group(1)
        time = parts.group(2)
        parts2 = re.search(r'^(.*)\s?-\s?(.*)$',time)
        if parts2:
            time_start = parts2.group(1).strip()
            time_end = parts2.group(2).strip()
            return day, time_start, time_end
        return day, time, ''
    return '','',''

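
# Illustrative calls for days_times() (the meeting string is a made-up example of the
# "days start - end" format this parser targets, not a value from a real schedule row):
#   days_times("MW 9:00 am - 10:20 am")  ->  ("MW", "9:00 am", "10:20 am")
#   days_times("")                       ->  ("", "", "")
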
def remove_year(s):
    s = re.sub(r'\-', '/', s)
    if len(s)>5: return s[5:]
    return s

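
# Illustrative call for remove_year(), assuming the export uses YYYY-MM-DD dates as the
# slicing above implies:  remove_year("2023-08-28") -> "08/28"
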
def argos_data_from_cvc():
    global dean,gp
    short_sem = 'su23'

    f3 = codecs.open('cache/%s_sched.json' % short_sem, 'w', 'utf-8')
    all_courses = []

    f = codecs.open('cache/sched_draft_%s.csv' % short_sem, 'r','utf-8')
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
    for r in reader:
        d = dict(list(zip(headers,r)))
        #print(d)
        parts = re.search(r'^([A-Z]+)(\d+[A-Z]*)$', d['Course_Code'])
        if parts:
            dept = parts.group(1)
            num = parts.group(2)
            my_dean = dean[dept]
            my_gp = gp[dept]
            code = dept + " " + num
            crn = d['CRN']
            cred = d['Units_Credit_hours']
            days, time_start, time_end = days_times(d['Meeting_Days_and_Times'])
            times = ""
            if time_start: times = time_start + "-" + time_end
            date = remove_year(d['Start_Date']) + "-" + remove_year(d['End_Date'])
            start = remove_year(d['Start_Date'])
            end = remove_year(d['End_Date'])
            ztc = d['ZTC']
            name = d['Course_Name']
            cap = d['Class_Capacity']
            rem = d['Available_Seats']
            act = int(cap) - int(rem)
            teacher = d['Instructor_First_Name'] + " " + d['Instructor_Last_Name']
            delivery = d['Delivery']
            if delivery == "Online":
                if days:
                    site = "Online"
                    type = "online live"
                    loc = "Online Live"
                else:
                    site = "Online"
                    type = "online"
                    loc = "ONLINE"
            elif delivery == "Hybrid":
                site = d['Campus_College']
                type = "hybrid"
                loc = d['Meeting_Locations']
            else:
                site = d['Campus_College']
                type = "in-person"
                loc = d['Meeting_Locations']
            this_course = { "crn": crn, "dept": dept, "num": num, "code": code, "name": name, "teacher": teacher, "type": type, "loc": loc, \
                "cap": cap.strip(), "act": act, "site": site, "date": date, "cred": cred.strip(), "ztc": ztc, "days": days, "time": times, \
                "start": start, "end": end, "time_start": time_start, "time_end": time_end, "dean": my_dean, "gp": my_gp}
            all_courses.append(this_course)
            print(site)
            #writer.writerow([my_gp,my_dean,dept,num,code,crn,name,act,site])
    print(all_courses)
    #print(json.dumps(all_courses))
    f3.write( json.dumps(all_courses,indent=2) )
    f3.close()
    expanded = list_latestarts(short_sem)


def expand_old_semesters():

    terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20,fa20,sp21,su21,fa21,sp22,su22,fa22'.split(',')
    terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20'.split(',')
    terms.reverse()

    for t in terms:
        list_latestarts(t)
        input('press return to continue.')

# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term="sp23"):

    show_summary = 1

    the_year = '20' + term[2:4]
    print("year: ", the_year, " semester: ", term)

    term_in = "cache/%s_sched.json" % term
    term_out = "cache/%s_latestarts.txt" % term
    expanded_out = "%s_sched_expanded.json" % term
    print("Writing output to " + term_out)
    infile = codecs.open(term_in, "r", "utf-8")
    outfile = codecs.open(term_out, "w", "utf-8")
    exoutfile = codecs.open('cache/' + expanded_out, "w", "utf-8")
    expanded = []
    sched = json.loads(infile.read())
    #print sched
    by_date = {}

    if show_summary: print("course \t loc \t type \t time")

    for C in sched:
        if (not C['type']) and C['loc'] != 'ONLINE': # and C['time']:
            C['type'] = 'in-person'

        if show_summary: print("%s \t %s \t %s \t %s" % (C['code'],C['loc'],C['type'],C['time']))

        if 'extra' in C:
            if 'partofday' in C and ('type' in C['extra'][0]) and (C['extra'][0]['type'] == 'online') and C['loc'] != "ONLINE LIVE":
                C['type'] = 'hybrid'

        times = C['time'].split("-")
        if len(times) > 1:
            time_start = times[0]
            time_end = times[1]

            try:
                startt = time.strptime(time_start,"%I:%M %p")
                endt = time.strptime(time_end,"%I:%M %p")
                min_start = startt.tm_min
                min_end = endt.tm_min
                if min_start == 0: min_start = "00"
                else: min_start = str(min_start)
                if min_end == 0: min_end = "00"
                else: min_end = str(min_end)
                C['time_start'] = "%i:%s" % (startt.tm_hour, min_start )
                C['time_end'] = "%i:%s" % (endt.tm_hour, min_end )
                if 0:
                    print("+ Parsed %s into %s and %s." % (C['time'], C['time_start'], C['time_end']))
            except Exception as e:
                print(e, "\n-- problem parsing time ", time_start, " or ", time_end)
        else:
            C['time_start'] = ''
            C['time_end'] = ''

        if re.search('TBA',C['date']):
            C['start'] = ''
            C['end'] = ''
            C['doy'] = ''
            expanded.append(C)
            continue

        parts = C['date'].split("-")
        start = parts[0] + "/" + the_year
        end = parts[1] + "/" + the_year

        try:
            startd = parser.parse(start)
            endd = parser.parse(end)
            C['start'] = "%i-%i" % (startd.month,startd.day)
            C['end'] = "%i-%i" % (endd.month,endd.day)
            C['doy'] = startd.timetuple().tm_yday
            expanded.append(C)
        except Exception as e:
            print(e, "\n-- problem parsing ", start, " or ", end)
        if not startd in by_date:
            by_date[startd] = []
        by_date[startd].append(C)

    exoutfile.write( json.dumps(expanded,indent=2) )
    exoutfile.close()
    put_file('/home/public/schedule/', 'cache/', expanded_out, 0)

    for X in sorted(by_date.keys()):
        #print("Start: ", X)
        if len(by_date[X]) < 200:
            prettydate = X.strftime("%A, %B %d")
            #print(prettydate + ": " + str(len(by_date[X])) + " courses")
            outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
            for Y in by_date[X]:
                #print "\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher']
                #print(Y)
                #outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] +"\n")
                outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
    return expanded

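
# Hedged sketch built on the expanded json that list_latestarts() writes: pick out
# sections starting more than two weeks after the earliest start in the term. The
# 14-day cutoff is an arbitrary illustration, not a rule used elsewhere in this file.
def example_late_start_sections(term='sp23', cutoff_days=14):
    with codecs.open('cache/%s_sched_expanded.json' % term, 'r', 'utf-8') as fh:
        sections = json.loads(fh.read())
    doys = [s['doy'] for s in sections if s.get('doy') != '']
    if not doys:
        return []
    first_day = min(doys)
    return [s for s in sections if s.get('doy') != '' and s['doy'] > first_day + cutoff_days]

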
if __name__ == "__main__":

    print('')
    options = { 1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
                2: ['Fetch rosters',fetch_current_rosters] ,
                3: ['Fetch rosters AND canvas data automatically',fetch_current_rosters_auto] ,
                4: ['Compute how registration is filling up classes', schedule_filling] ,
                5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
                6: ['Canvas data: interactive sync', interactive ],
                7: ['Canvas data: automated sync', sync_non_interactive ],
                8: ['Scrape schedule from ssb', scrape_schedule_multi ],
                9: ['Test ssb calls with python', scrape_schedule_py ],
                10: ['schedule to db', scrape_for_db ],
                11: ['clean argos draft schedule file', argos_data_from_cvc],
                12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
                13: ['Parse deanza schedule', dza_sched ],
                }

    if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
        resp = int(sys.argv[1])
        print("\n\nPerforming: %s\n\n" % options[resp][0])

    else:
        print('')
        for key in options:
            print(str(key) + '.\t' + options[key][0])

        print('')
        resp = input('Choose: ')

    # Call the function in the options dict
    options[ int(resp)][1]()
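
    # Usage note (assumption about how this script is typically launched): passing the
    # option number on the command line, e.g. "python thisfile.py 8", skips the menu
    # above and runs that task directly, which is convenient for cron jobs.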


# Testing

#if __name__ == "__main__":
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
#print "These are the users: "
#print users

#getSemesterSchedule()


#get_doc()
#pass