# This Python file uses the following encoding: utf-8
|
||
|
||
#from __future__ import print_function
|
||
from time import strptime
|
||
from bs4 import BeautifulSoup as bs
|
||
from util import UnicodeDictReader
|
||
from datetime import datetime as dt
|
||
import pandas as pd
|
||
import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
|
||
import pdb
|
||
from collections import defaultdict
|
||
from deepdiff import DeepDiff
|
||
from secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, GOO, GOO_PIN, token, url, domain, account_id, header, g_id, g_secret
|
||
from secrets import instructure_url, instructure_username, instructure_private_key
|
||
|
||
|
||
|
||
"""
|
||
Everything to do with fetching data,
|
||
- From iLearn, via token
|
||
- current roster uploads from instructures sftp site
|
||
- raw logs and other from canvas data repo
|
||
- from ssb, use firefox to scrape the schedule
|
||
|
||
|
||
And some subsequent processing:
|
||
- Raw roster files, into a more compact json format
|
||
- Raw logs into something more useful
|
||
"""
|
||
|
||
|
||
|
||
|
||
verbose = False
|
||
|
||
users = {}
|
||
users_by_id = {}
|
||
|
||
# todo: all these constants for SSB -- line 1008
|
||
#
|
||
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
|
||
|
||
schedfile = 'temp.csv'
|
||
|
||
|
||
SEMESTER = 'Summer 2019'
|
||
short_sem = 'su19'
|
||
semester_begin = strptime('06/17', '%m/%d')
|
||
filename = 'su19_sched.json'
|
||
|
||
SEMESTER = 'Summer 2020'
|
||
short_sem = 'su20'
|
||
semester_begin = strptime('06/15', '%m/%d')
|
||
filename = 'su20_sched.json'
|
||
|
||
SEMESTER = 'Fall 2020'
|
||
short_sem = 'fa20'
|
||
semester_begin = strptime('08/24', '%m/%d')
|
||
filename = 'fa20_sched.json'
|
||
|
||
SEMESTER = 'Spring 2021'
|
||
short_sem = 'sp21'
|
||
semester_begin = strptime('02/01', '%m/%d')
|
||
filename = 'sp21_sched.json'
|
||
filename_html = 'sp21_sched.html'
|
||
|
||
|
||
SEMESTER = 'Summer 2021 (View only)'
|
||
short_sem = 'su21'
|
||
semester_begin = strptime('06/14', '%m/%d')
|
||
filename = 'su21_sched.json'
|
||
filename_html = 'su21_sched.html'
|
||
|
||
|
||
|
||
|
||
# Current or upcoming semester is first.
|
||
sems = ['su21', 'sp21', 'fa20', 'su20', 'sp20'] #, 'fa19'] # 'sp19']
|
||
|
||
sys.setrecursionlimit( 100000 )
|
||
|
||
local_data_folder = 'cache/canvas_data/'
|
||
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')
# Map each department prefix to a broad subject group
gp = {
    'ACCT': 'info',  'AE': 'skill',   'AH': 'well',    'AJ': 'skill',   'AMT': 'skill',
    'ANTH': 'soc',   'APE': 'skill',  'ART': 'art',    'ASTR': 'stem',  'ATH': 'well',
    'BIO': 'stem',   'BOT': 'info',   'BUS': 'info',   'CD': 'skill',   'CHEM': 'stem',
    'CMGT': 'skill', 'CMUN': 'comm',  'COS': 'skill',  'CSIS': 'stem',  'CWE': 'skill',
    'DM': 'info',    'ECOL': 'stem',  'ECON': 'info',  'ENGL': 'soc',   'ENGR': 'stem',
    'ENVS': 'stem',  'ESL': 'comm',   'ETHN': 'comm',  'FRNH': 'comm',  'GEOG': 'stem',
    'GEOL': 'stem',  'GUID': 'soc',   'HE': 'well',    'HIST': 'soc',   'HUM': 'soc',
    'HVAC': 'skill', 'JFT': 'skill',  'JLE': 'skill',  'JOUR': 'comm',  'JPN': 'comm',
    'KIN': 'well',   'LIB': 'comm',   'LIFE': 'well',  'MATH': 'stem',  'MCTV': 'art',
    'MUS': 'art',    'PHIL': 'soc',   'PHYS': 'stem',  'POLS': 'soc',   'PSCI': 'stem',
    'PSYC': 'soc',   'RE': 'skill',   'SJS': 'soc',    'SOC': 'soc',    'SPAN': 'comm',
    'THEA': 'art',   'WELD': 'skill', 'WTRM': 'skill', 'MGMT': 'skill', 'MKTG': 'skill',
    'HTM': 'skill',
}

# Map each department prefix to its dean (initials)
dean = {
    'AH': 'et',   'HE': 'et',   'ATH': 'et',  'KIN': 'et',  'LIFE': 'et',
    'AE': 'ss',   'APE': 'ss',  'ACCT': 'ss', 'AJ': 'ss',   'AMT': 'ss',
    'HVAC': 'ss', 'JFT': 'ss',  'JLE': 'ss',  'RE': 'ss',   'WTRM': 'ss',
    'WELD': 'ss', 'ANTH': 'nl', 'ART': 'nl',  'ASTR': 'jn', 'BIO': 'jn',
    'BOT': 'ss',  'BUS': 'ss',  'CD': 'ss',   'CHEM': 'jn', 'CMGT': 'ss',
    'CMUN': 'nl', 'COS': 'ss',  'CSIS': 'ss', 'CWE': 'ss',  'DM': 'ss',
    'ECOL': 'jn', 'ECON': 'nl', 'ENGL': 'nl', 'ENGR': 'jn', 'ENVS': 'jn',
    'ESL': 'ss',  'ETHN': 'nl', 'FRNH': 'nl', 'GEOG': 'jn', 'GEOL': 'jn',
    'GUID': 'nl', 'HIST': 'nl', 'HUM': 'nl',  'JOUR': 'nl', 'JPN': 'nl',
    'LIB': 'kn',  'MATH': 'jn', 'MCTV': 'nl', 'MGMT': 'ss', 'MKTG': 'ss',
    'HTM': 'ss',  'MUS': 'nl',  'PHIL': 'nl', 'PHYS': 'jn', 'POLS': 'nl',
    'PSCI': 'jn', 'PSYC': 'nl', 'SJS': 'nl',  'SOC': 'nl',  'SPAN': 'nl',
    'THEA': 'nl',
}


class FetchError(Exception):
|
||
pass
|
||
|
||
|
||
DEBUG = 0
|
||
|
||
def d(s,end=''):
|
||
global DEBUG
|
||
if end and DEBUG: print(s,end=end)
|
||
elif DEBUG: print(s)
|
||
|
||
################
|
||
################ CANVAS API MAIN FETCHING FUNCTIONS
|
||
################
|
||
################
|
||
################
|
||
|
||
|
||
|
||
|
||
# Main canvas querying fxn
|
||
def fetch(target,verbose=0):
|
||
    # if there are more results, recursively call myself, adding on to the results.
    results = 0
    count = 0
|
||
if target[0:4] != "http": target = url + target
|
||
if verbose:
|
||
print("++ Fetching: " + target)
|
||
r2 = requests.get(target, headers = header)
|
||
#if verbose:
|
||
#print "++ Got: " + r2.text
|
||
try:
|
||
results = json.loads(r2.text)
|
||
count = len(results)
|
||
except:
|
||
print("-- Failed to parse: ", r2.text)
|
||
if verbose:
|
||
print("Got %i results" % count)
|
||
if verbose > 1:
|
||
print(r2.headers)
|
||
|
||
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
|
||
tempout.write(r2.text+"\n\n")
|
||
tempout.close()
|
||
|
||
if ('link' in r2.headers and count > 0):
|
||
links = r2.headers['link'].split(',')
|
||
for L in links:
|
||
ll = L.split(';')
|
||
link = ll[0].replace("<","")
|
||
link = link.replace(">","")
|
||
if re.search(r'next', ll[1]):
|
||
if (verbose): print("++ More link: " + link)
|
||
#link = re.sub(r'per_page=10$', 'per_page=100', link) # link.replace('per_page=10','per_page=500')
|
||
#if (verbose): print("++ More link: " + link)
|
||
|
||
nest = fetch(link,verbose)
|
||
if isinstance(results,dict): results.update(nest)
|
||
else: results.extend(nest)
|
||
return results
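# Example usage (a sketch, not called anywhere): fetch() follows Canvas's "next"
# Link headers, so one call returns the fully merged, paginated result. The course
# id below is illustrative, borrowed from the test comment at the bottom of this file.
#
#   users = fetch('/api/v1/courses/69/users?per_page=100', verbose=1)
#   for u in users:
#       print(u['id'], u.get('name'))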
|
||
|
||
# Main canvas querying fxn - stream version - don't die on big requests
|
||
def fetch_stream(target,verbose=0):
|
||
    # if there are more results, keep following the "next" links, yielding one page at a time.
    results = 0
    count = 0
|
||
while target:
|
||
if target[0:4] != "http": target = url + target
|
||
if verbose:
|
||
print("++ Fetching: " + target)
|
||
r2 = requests.get(target, headers = header)
|
||
if r2.status_code == 502:
|
||
raise FetchError()
|
||
try:
|
||
results = json.loads(r2.text)
|
||
count = len(results)
|
||
except:
|
||
print("-- Failed to parse: ", r2.text)
|
||
if verbose:
|
||
print("Got %i results" % count)
|
||
if verbose > 1:
|
||
print(r2.headers)
|
||
tempout = codecs.open('cache/fetchcache.txt','a','utf-8')
|
||
tempout.write(r2.text+"\n\n")
|
||
tempout.close()
|
||
|
||
next_link_found = 0
|
||
if ('link' in r2.headers and count > 0):
|
||
links = r2.headers['link'].split(',')
|
||
for L in links:
|
||
ll = L.split(';')
|
||
link = ll[0].replace("<","")
|
||
link = link.replace(">","")
|
||
if re.search(r'next', ll[1]):
|
||
target = link
|
||
next_link_found = 1
|
||
break
|
||
if not next_link_found: target = 0
|
||
yield results
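# Example usage (sketch): unlike fetch(), fetch_stream() is a generator that yields
# one page of results at a time, so big queries don't have to fit in memory at once.
# The endpoint below is illustrative.
#
#   for page in fetch_stream('/api/v1/accounts/%s/courses?per_page=100' % account_id, verbose=1):
#       for course in page:
#           print(course['id'])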
|
||
|
||
|
||
# for dicts with one key, collapse that one key out, cause
|
||
# paging makes problems... example: enrollment_terms
|
||
def fetch_collapse(target,collapse='',verbose=0):
|
||
    # if there are more results, recursively call myself, adding on to the results.
|
||
results = 0
|
||
if target[0:4] != "http": target = url + target
|
||
if verbose:
|
||
print("++ Fetching: " + target)
|
||
r2 = requests.get(target, headers = header)
|
||
#if verbose:
|
||
#print "++ Got: " + r2.text
|
||
try:
|
||
results = json.loads(r2.text)
|
||
except:
|
||
print("-- Failed to parse: ", r2.text)
|
||
if verbose: print(r2.headers)
|
||
|
||
if collapse and collapse in results:
|
||
results = results[collapse]
|
||
|
||
if ('link' in r2.headers):
|
||
links = r2.headers['link'].split(',')
|
||
for L in links:
|
||
ll = L.split(';')
|
||
link = ll[0].replace("<","")
|
||
link = link.replace(">","")
|
||
if re.search(r'next', ll[1]):
|
||
if (verbose): print("++ More link: " + link)
|
||
nest = fetch_collapse(link, collapse, verbose)
|
||
if isinstance(results,dict): results.update(nest)
|
||
else: results.extend(nest)
|
||
return results
|
||
|
||
|
||
|
||
################
|
||
################ SCHEDULE PARSING HELPERS
|
||
################
|
||
################
|
||
################
|
||
|
||
# Teacher name format changed. Remove commas and switch first to last
|
||
def fix_t_name(name):
    name = name.strip()
    name = re.sub(r'\s+', ' ', name)
    parts = name.split(', ')
    if len(parts) > 1:
        return parts[1].strip() + " " + parts[0].strip()
    return name
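# For instance (assuming the usual "Last, First" input from Banner):
#   fix_t_name('Doe, Jane  ')  ->  'Jane Doe'
#   fix_t_name('TBA')          ->  'TBA'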
|
||
|
||
# Separate dept and code
|
||
def split_class_dept(c):
|
||
return c.split(' ')[0]
|
||
def split_class_code(c):
|
||
num = c.split(' ')[1]
|
||
    parts = re.match(r'(\d+)([a-zA-Z]+)', num)
|
||
#ret = "Got %s, " % c
|
||
if parts:
|
||
r = int(parts.group(1))
|
||
#print(ret + "returning %i." % r)
|
||
return r
|
||
#print(ret + "returning %s." % num)
|
||
return int(num)
|
||
def split_class_code_letter(c):
|
||
num = c.split(' ')[1]
|
||
    parts = re.match(r'(\d+)([A-Za-z]+)', num)
|
||
if parts:
|
||
return parts.group(2)
|
||
return ''
|
||
|
||
# go from sp20 to 2020spring
|
||
def shortToLongSem(s):
|
||
parts = re.search(r'(\w\w)(\d\d)', s)
|
||
yr = parts.group(2)
|
||
season = parts.group(1)
|
||
seasons = {'sp':'spring','su':'summer','fa':'fall','wi':'winter'}
|
||
return '20'+yr+seasons[season]
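# For instance: shortToLongSem('sp20') -> '2020spring', which matches the
# cache/semesters/ folder naming used below.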
|
||
|
||
# Go to the semesters folder and read the schedule. Return dataframe
|
||
def getSemesterSchedule(short='sp21'): # I used to be current_schedule
|
||
# todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again
|
||
|
||
filename = 'cache/semesters/'+shortToLongSem(short)+'/' + short + '_sched.json'
|
||
print("opening %s" % filename)
|
||
#openfile = open(filename,'r')
|
||
#a = json.loads(openfile)
|
||
#return pd.DataFrame(a)
|
||
schedule = pd.read_json(filename)
|
||
schedule.teacher = schedule['teacher'].apply(fix_t_name)
|
||
#print schedule['teacher']
|
||
for index,r in schedule.iterrows():
|
||
tch = r['teacher']
|
||
parts = tch.split(' . ')
|
||
if len(parts)>1:
|
||
#print "Multiple teachers: (" + tch + ")"
|
||
schedule.loc[index,'teacher'] = parts[0]
|
||
#print " Fixed original: ", schedule.loc[index]
|
||
|
||
for t in parts[1:]:
|
||
r['teacher'] = t
|
||
schedule.loc[-1] = r
|
||
#print " New row appended: ", schedule.loc[-1]
|
||
schedule = schedule.assign(dept = schedule['code'].apply(split_class_dept))
|
||
schedule = schedule.assign(codenum = schedule['code'].apply(split_class_code))
|
||
schedule = schedule.assign(codeletter = schedule['code'].apply(split_class_code_letter))
|
||
#print(schedule)
|
||
schedule['sem'] = short
|
||
#print schedule.columns
|
||
return schedule
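# Example usage (sketch): the returned DataFrame has one row per section, with the
# dept/codenum/codeletter/sem columns added above.
#
#   sched = getSemesterSchedule('sp21')
#   print(sched[sched['dept'] == 'MATH'][['code', 'teacher', 'crn']])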
|
||
|
||
|
||
|
||
online_courses = {}
|
||
def prep_online_courses_df():
|
||
global online_courses
|
||
    schedule = getSemesterSchedule(short_sem) # from Banner; getSemesterSchedule() was formerly named current_schedule()
|
||
online_courses = schedule[lambda x: x.type=='online']
|
||
|
||
def course_is_online(crn):
|
||
global online_courses
|
||
#print "looking up: " + str(crn)
|
||
#print online_courses
|
||
course = online_courses[lambda x: x.crn==int(crn)]
|
||
return len(course)
|
||
|
||
def get_crn_from_name(name):
|
||
#print "name is: "
|
||
#print(name)
|
||
m = re.search( r'(\d\d\d\d\d)', name)
|
||
if m: return int(m.groups(1)[0])
|
||
else: return 0
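# For instance (course names here are made up, but iLearn course names normally
# embed the 5-digit CRN):
#   get_crn_from_name('MATH 233 Section 12345 Spring') -> 12345
#   get_crn_from_name('Sandbox course')                -> 0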
|
||
|
||
def get_enrlmts_for_user(user,enrollments):
|
||
#active enrollments
|
||
u_en = enrollments[ lambda x: (x['user_id'] == user) & (x['workflow']=='active') ]
|
||
return u_en[['type','course_id']]
|
||
|
||
|
||
|
||
################
|
||
################ CANVAS DATA
|
||
################
|
||
################
|
||
################
|
||
|
||
|
||
# Get something from Canvas Data
|
||
def do_request(path):
|
||
#Set up the request pieces
|
||
method = 'GET'
|
||
host = 'api.inshosteddata.com'
|
||
apiTime = dt.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
|
||
apiContentType = 'application/json'
|
||
|
||
msgList = []
|
||
msgList.append(method)
|
||
msgList.append(host)
|
||
msgList.append(apiContentType)
|
||
msgList.append('')
|
||
msgList.append(path)
|
||
msgList.append('')
|
||
msgList.append(apiTime)
|
||
msgList.append(apiSecret)
|
||
|
||
msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
|
||
|
||
sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
|
||
sig = sig.decode('utf-8')
|
||
|
||
headers = {}
|
||
headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
|
||
headers['Date'] = apiTime
|
||
headers['Content-type'] = apiContentType
|
||
|
||
|
||
#Submit the request/get a response
|
||
uri = "https://"+host+path
|
||
print (uri)
|
||
print (headers)
|
||
response = requests.request(method='GET', url=uri, headers=headers, stream=True)
|
||
|
||
#Check to make sure the request was ok
|
||
if(response.status_code != 200):
|
||
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
|
||
else:
|
||
#Use the downloaded data
|
||
jsonData = response.json()
|
||
#print(json.dumps(jsonData, indent=4))
|
||
return jsonData
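# Example usage (sketch): do_request() signs a GET against the Canvas Data API with
# the HMAC scheme above. The sync listing used elsewhere in this file looks like:
#
#   resp = do_request('/api/account/self/file/sync')
#   if resp:
#       print(len(resp['files']), 'files available')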
|
||
|
||
# Canvas data, download all new files
|
||
def sync_non_interactive():
|
||
resp = do_request('/api/account/self/file/sync')
|
||
mylog.write(json.dumps(resp, indent=4))
|
||
#mylog.close()
|
||
gotten = os.listdir(local_data_folder)
|
||
wanted = []
|
||
i = 0
|
||
for x in resp['files']:
|
||
filename = x['filename']
|
||
exi = "No "
|
||
if filename in gotten: exi = "Yes"
|
||
else: wanted.append(x)
|
||
|
||
print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
|
||
i += 1
|
||
print("I will attempt to download %i files." % len(wanted))
|
||
|
||
#answer = input("Press enter to begin, or q to quit ")
|
||
#if not answer == '': return
|
||
|
||
good_count = 0
|
||
bad_count = 0
|
||
for W in wanted:
|
||
print("Downloading: " + W['filename'])
|
||
response = requests.request(method='GET', url=W['url'], stream=True)
|
||
if(response.status_code != 200):
|
||
print('Request response went bad. Got back a %s code, meaning the request was %s' % \
|
||
(response.status_code, response.reason))
|
||
print('URL: ' + W['url'])
|
||
bad_count += 1
|
||
|
||
else:
|
||
#Use the downloaded data
|
||
with open(local_data_folder + W['filename'], 'wb') as fd:
|
||
for chunk in response.iter_content(chunk_size=128):
|
||
fd.write(chunk)
|
||
print("Success")
|
||
good_count += 1
|
||
print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
|
||
|
||
|
||
# list files in canvas_data (online) and choose one or some to download.
|
||
def interactive():
|
||
resp = do_request('/api/account/self/file/sync')
|
||
mylog.write(json.dumps(resp, indent=4))
|
||
#mylog.close()
|
||
i = 0
|
||
gotten = os.listdir(local_data_folder)
|
||
for x in resp['files']:
|
||
print(str(i) + '.\t' + x['filename'])
|
||
i += 1
|
||
which = input("Which files to get? (separate with commas, or say 'all') ")
|
||
if which=='all':
|
||
        which_a = list(range(i))  # indices 0..i-1, one per listed file
|
||
else:
|
||
which_a = which.split(",")
|
||
for W in which_a:
|
||
this_i = int(W)
|
||
this_f = resp['files'][this_i]
|
||
filename = this_f['filename']
|
||
if filename in gotten: continue
|
||
print("Downloading: " + filename)
|
||
response = requests.request(method='GET', url=this_f['url'], stream=True)
|
||
if(response.status_code != 200):
|
||
            print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
|
||
else:
|
||
#Use the downloaded data
|
||
with open(local_data_folder + filename, 'wb') as fd:
|
||
for chunk in response.iter_content(chunk_size=128):
|
||
fd.write(chunk)
|
||
print("Success")
|
||
"""if filename.split('.')[-1] == 'gz':
|
||
try:
|
||
plain_filename = 'canvas_data/' + ".".join(filename.split('.')[:-1])
|
||
pf = open(plain_filename,'w')
|
||
with gzip.open('canvas_data/' + filename , 'rb') as f:
|
||
pf.write(f.read())
|
||
except Exception as e:
|
||
print "Failed to ungizp. Probably too big: " + str(e)"""
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
###### SSB SCHEDULE
|
||
######
|
||
######
|
||
######
|
||
|
||
def todays_date_filename(): # helper
|
||
    n = dt.now()  # 'datetime' is the module here; the class is imported as dt
|
||
m = n.month
|
||
if m < 10: m = "0"+str(m)
|
||
d = n.day
|
||
if d < 10: d = "0" + str(d)
|
||
return "reg_" + short_sem + "_" + str(n.year) + str(m) + str(d)
|
||
|
||
def nowAsStr(): # possible duplicate
|
||
#Get the current time, printed in the right format
|
||
currentTime = datetime.datetime.utcnow()
|
||
prettyTime = currentTime.strftime('%a, %d %b %Y %H:%M:%S GMT')
|
||
return prettyTime
|
||
|
||
|
||
def row_has_data(r): # helper
|
||
if r.find_all('th'):
|
||
return False
|
||
if len(r.find_all('td')) > 2:
|
||
return True
|
||
if re.search('Note\:', r.get_text()):
|
||
return True
|
||
return False
|
||
|
||
#dbg = open('cache/temp_scheddebug_' + 'sp20' + '.txt','w')
|
||
|
||
|
||
def row_text(r): # helper
|
||
#global dbg
|
||
|
||
d("Row Txt Fxn gets: ")
|
||
arr = []
|
||
for t in r.find_all('td'):
|
||
if t.contents and len(t.contents) and t.contents[0].name == 'img':
|
||
arr.append("1")
|
||
d("img")
|
||
r_text = t.get_text()
|
||
arr.append(r_text)
|
||
if 'colspan' in t.attrs and t['colspan']=='2':
|
||
d('[colspan2]')
|
||
arr.append('')
|
||
d("\t"+r_text, end=" ")
|
||
d('')
|
||
|
||
if len(arr)==1 and re.search('Note\:',arr[0]):
|
||
note_line = clean_funny( arr[0] )
|
||
note_line = re.sub(r'\n',' ', note_line)
|
||
note_line = re.sub(r'"','', note_line)
|
||
#note_line = re.sub(r',','\,', note_line)
|
||
return ',,,,,,,,,,,,,,,,,,"' + note_line + '"\n'
|
||
del arr[0]
|
||
arr[1] = clean_funny(arr[1])
|
||
arr[2] = clean_funny(arr[2])
|
||
if arr[1]: arr[1] = arr[1] + " " + arr[2]
|
||
del arr[2]
|
||
arr = [ re.sub(r' ','',a) for a in arr]
|
||
arr = [ re.sub(',','. ',a) for a in arr]
|
||
arr = [ re.sub('\(P\)','',a) for a in arr]
|
||
arr = [ a.strip() for a in arr]
|
||
#del arr[-1]
|
||
r = ','.join(arr)+'\n'
|
||
r = re.sub('\n','',r)
|
||
r = re.sub('add to worksheet','',r)
|
||
d("Row Txt Fxn returns: " + r + "\n\n")
|
||
|
||
return r + '\n'
|
||
|
||
|
||
|
||
# Take banner's html and make a csv(?) file
|
||
def ssb_to_csv(src):
|
||
#out = codecs.open(schedfile,'w','utf-8')
|
||
output = 'crn,code,sec,cmp,cred,name,days,time,cap,act,rem,wl_cap,wl_act,wl_rem,teacher,date,loc,ztc,note\n'
|
||
b = bs(src, 'html.parser')
|
||
tab = b.find(class_="datadisplaytable")
|
||
if not tab:
|
||
print("hmm... didn't find a 'datadisplaytable' in this html: ")
|
||
#print(src)
|
||
return 0
|
||
rows = tab.find_all('tr')
|
||
drows = list(filter(row_has_data,rows))
|
||
for dd in drows:
|
||
t = row_text(dd)
|
||
output += t
|
||
return output
|
||
|
||
|
||
|
||
def clean_funny(str):
    # Banner marks empty cells with a non-breaking space (U+00A0)
    if str and str == '\xa0': return ''
    return str
|
||
def clean_funny2(str):
|
||
if str and str == '\xa0': return ''
|
||
if str and str == ' ': return ''
|
||
return str
|
||
|
||
def clean_funny3(str):
|
||
return re.sub('\xa0','',str)
|
||
|
||
|
||
|
||
### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
|
||
def course_start(course):
|
||
    #todo: use this to make an early/late/short field and store semester dates w/ other constants
|
||
|
||
    start = dt(2019, 1, 28)   # Spring 2019 term dates
    end = dt(2019, 5, 24)
|
||
|
||
# is it normal, early, late, winter?
|
||
li = course[0]
|
||
date = li[12]
|
||
|
||
if date=='01/28-05/24':
|
||
return 'Normal'
|
||
if date=='TBA':
|
||
return 'TBA'
|
||
if date=='01/02-01/25':
|
||
return 'Winter'
|
||
if date=='01/02-01/24':
|
||
return 'Winter'
|
||
|
||
ma = re.search( r'(\d+)\/(\d+)\-(\d+)\/(\d+)', date)
|
||
if ma:
|
||
# TODO do these years matter?
|
||
        mystart = dt(2019, int(ma.group(1)), int(ma.group(2)))
        if int(ma.group(1)) > 10: mystart = dt(2018, int(ma.group(1)), int(ma.group(2)))
        myend = dt(2019, int(ma.group(3)), int(ma.group(4)))
|
||
length = myend - mystart
|
||
weeks = length.days / 7
|
||
|
||
if mystart != start:
|
||
if mystart < start:
|
||
#print 'Early Start ', str(weeks), " weeks ",
|
||
return 'Early start'
|
||
else:
|
||
#print 'Late Start ', str(weeks), " weeks ",
|
||
return 'Late start'
|
||
else:
|
||
if myend > end:
|
||
#print 'Long class ', str(weeks), " weeks ",
|
||
return 'Long term'
|
||
else:
|
||
#print 'Short term ', str(weeks), " weeks ",
|
||
return 'Short term'
|
||
#return ma.group(1) + '/' + ma.group(2) + " end: " + ma.group(3) + "/" + ma.group(4)
|
||
else:
|
||
return "Didn't match: " + date
|
||
|
||
|
||
def time_to_partofday(t):
|
||
#todo: account for multiple sites/rows
|
||
# 11:20 am-12:10 pm
|
||
mor = strptime('12:00 PM', '%I:%M %p')
|
||
mid = strptime( '2:00 PM', '%I:%M %p')
|
||
aft = strptime( '6:00 PM', '%I:%M %p')
|
||
if t == 'TBA':
|
||
return 'TBA'
|
||
t = t.upper()
|
||
parts = t.split('-')
|
||
try:
|
||
begin = strptime(parts[0], '%I:%M %p')
|
||
end = strptime(parts[1], '%I:%M %p')
|
||
if end > aft:
|
||
return "Evening"
|
||
if end > mid:
|
||
return "Afternoon"
|
||
if end > mor:
|
||
return "Midday"
|
||
return "Morning"
|
||
#return begin,end
|
||
except Exception as e:
|
||
#print 'problem parsing: ', t, " ",
|
||
return ""
|
||
|
||
# Deduce a 'site' field, based on room name and known offsite locations
|
||
def room_to_site(room,verbose=0):
|
||
#todo: account for multiple sites/rows
|
||
#todo: better way to store these offsite labels
|
||
othersites = 'AV,SBHS I-243,SBHS I-244,LOADCS,HOPEH,HOPEG,PLY,SAS,SBHS,LOHS,CHS,SBRAT,'.split(',')
|
||
# is it gilroy, mh, hol, other, online or hybrid?
|
||
site = 'Gilroy'
|
||
#if len(course[0]) > 13:
|
||
# room = course[0][13]
|
||
if room in othersites:
|
||
site = "Other"
|
||
if room == 'TBA':
|
||
site = 'TBA'
|
||
if room == 'AV':
|
||
site = 'San Martin Airport'
|
||
if re.search('MHG',room):
|
||
site = 'Morgan Hill'
|
||
if re.search('HOL',room):
|
||
site = 'Hollister'
|
||
if re.search('COY',room):
|
||
site = 'Coyote Valley'
|
||
if re.search('OFFSTE',room):
|
||
site = 'Other'
|
||
if re.search('ONLINE',room):
|
||
site = 'Online'
|
||
if verbose: print(room, '\t', end=' ')
|
||
return site
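# For instance (illustrative room strings):
#   room_to_site('MHG120') -> 'Morgan Hill'
#   room_to_site('ONLINE') -> 'Online'
#   room_to_site('TBA')    -> 'TBA'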
|
||
|
||
|
||
from io import StringIO
|
||
|
||
|
||
# take text lines and condense them to one dict per section
|
||
def to_section_list(input_text,verbose=0):
|
||
this_course = ''
|
||
#todo: no output files
|
||
#jout = codecs.open(filename, 'w', 'utf-8')
|
||
#input = csv.DictReader(open(schedfile,'r'))
|
||
#input = UnicodeDictReader(input_text.splitlines())
|
||
all_courses = []
|
||
|
||
|
||
try:
|
||
f = StringIO(input_text)
|
||
except:
|
||
print("ERROR with this input_text:")
|
||
print(input_text)
|
||
reader = csv.reader(f, delimiter=',')
|
||
headers = next(reader)
|
||
for r in reader:
|
||
d = dict(list(zip(headers,r)))
|
||
#pdb.set_trace()
|
||
# clean funny unicode char in blank entries
|
||
r = {k: clean_funny2(v) for k,v in list(d.items()) }
|
||
if verbose: print("Cleaned: " + str(r))
|
||
|
||
if 'time' in r:
|
||
if r['time']=='TBA': r['time'] = ''
|
||
if r['time']: r['partofday'] = time_to_partofday(r['time'])
|
||
|
||
r['type'] = ''
|
||
|
||
if 'loc' in r:
|
||
if r['loc'] == 'ONLINE': r['type'] = 'online'
|
||
|
||
if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
|
||
if r['loc']: r['site'] = room_to_site(r['loc'],verbose)
|
||
|
||
if 'code' in r:
|
||
if re.search(r'ONLINE\sLIVE',r['code']):
|
||
r['type'] = 'online live'
|
||
elif re.search(r'ONLINE',r['code']):
|
||
r['type'] = 'online'
|
||
|
||
        # Does it have a CRN? If so it starts a new section; otherwise it continues the previous one.
|
||
if r['crn']: # is a new course or a continuation?
|
||
if verbose: print(" it's a new section.")
|
||
if this_course:
|
||
if not this_course['extra']: this_course.pop('extra',None)
|
||
all_courses.append(this_course)
|
||
this_course = r
|
||
#print(r['name'])
|
||
this_course['extra'] = []
|
||
else:
|
||
# is a continuation line
|
||
if verbose: print(" additional meeting: " + str(r))
|
||
for k,v in list(r.items()):
|
||
if not v: r.pop(k,None)
|
||
# TODO: if extra line is different type?
|
||
#if this_course['type']=='online' and r['type'] != 'online': this_course['type'] = 'hybrid'
|
||
#elif this_course['type']!='online' and r['type'] == 'online': this_course['type'] = 'hybrid'
|
||
this_course['extra'].append(r)
|
||
return all_courses
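# Example pipeline (sketch): the same html -> csv text -> list-of-dicts chain that
# scrape_schedule() and recent_schedules() use. The cached file name is illustrative.
#
#   html = codecs.open('cache/sp21_sched.html', 'r', 'utf-8').read()
#   sections = to_section_list(ssb_to_csv(html))
#   online = [s for s in sections if s.get('type') == 'online']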
|
||
|
||
|
||
# Schedule / course filling history
|
||
# csv headers: crn, code, teacher, datetime, cap, act, wlcap, wlact
|
||
# Log the history of enrollments per course during registration
|
||
def log_section_filling(current_sched_list):
|
||
rows = 'timestamp crn code teacher cap act wl_cap wl_act'.split(' ')
|
||
rows_j = 'crn code teacher cap act wl_cap wl_act'.split(' ')
|
||
print(rows_j)
|
||
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
|
||
csv_fn = 'cache/reg_history_' + short_sem + '.csv'
|
||
with codecs.open(csv_fn,'a','utf-8') as f:
|
||
writer = csv.writer(f)
|
||
for S in current_sched_list:
|
||
#print(S)
|
||
items = [now,]
|
||
[ items.append( S[X] ) for X in rows_j ]
|
||
writer.writerow(items)
|
||
|
||
# Same as above, but compressed, act only
|
||
def log_section_filling2(current_sched_list):
|
||
|
||
|
||
|
||
now = datetime.datetime.now().strftime('%Y-%m-%dT%H')
|
||
|
||
todays_data = { int(S['crn']): S['act'] for S in current_sched_list }
|
||
#print(todays_data)
|
||
|
||
todays_df = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
|
||
todays_df = todays_df.rename_axis('crn')
|
||
#print(todays_df)
|
||
todays_df.to_csv('cache/reg_today_new.csv', index=True)
|
||
|
||
try:
|
||
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
|
||
print(myframe)
|
||
except:
|
||
fff = open('cache/reg_data_'+short_sem+'.csv','w')
|
||
fff.write('crn\n')
|
||
fff.close()
|
||
myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
|
||
#myframe = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
|
||
#myframe = myframe.rename_axis('crn')
|
||
print("Creating new data file for this semester.")
|
||
|
||
new_df = myframe.join( todays_df, on='crn', how='outer' )
|
||
new_df = new_df.rename_axis('crn')
|
||
print(new_df)
|
||
|
||
reg_data_filename = 'reg_data_' + short_sem + '.csv'
|
||
new_df.to_csv('cache/' + reg_data_filename, index=False)
|
||
put_file('/web/phowell/schedule/', 'cache/', reg_data_filename, 0)
|
||
print('ok')
|
||
|
||
|
||
|
||
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
|
||
def scrape_schedule():
|
||
|
||
#url = "https://ssb.gavilan.edu/prod/twbkwbis.P_GenMenu?name=bmenu.P_StuMainMnu"
|
||
url = "https://ssb-prod.ec.gavilan.edu/PROD/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"
|
||
|
||
|
||
text = ''
|
||
|
||
from selenium import webdriver
|
||
from selenium.webdriver.common.keys import Keys
|
||
from selenium.webdriver.support.ui import WebDriverWait, Select
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
|
||
try:
|
||
driver = webdriver.Firefox()
|
||
driver.get(url)
|
||
driver.find_element_by_id("UserID").clear()
|
||
driver.find_element_by_id("UserID").send_keys(GOO)
|
||
driver.find_element_by_name("PIN").send_keys(GOO_PIN)
|
||
driver.find_element_by_name("loginform").submit()
|
||
driver.implicitly_wait(5)
|
||
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_link_text("Students").click()
|
||
driver.implicitly_wait(5)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_link_text("Registration").click()
|
||
driver.implicitly_wait(5)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_link_text("Search for Classes").click()
|
||
driver.implicitly_wait(15)
|
||
print(driver.title)
|
||
|
||
dd = Select(driver.find_element_by_name("p_term"))
|
||
if (dd):
|
||
dd.select_by_visible_text(SEMESTER)
|
||
driver.find_element_by_xpath("/html/body/div/div[4]/form").submit()
|
||
driver.implicitly_wait(15)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_xpath("/html/body/div/div[4]/form/input[18]").click()
|
||
driver.implicitly_wait(10)
|
||
print(driver.title)
|
||
|
||
driver.find_element_by_name("SUB_BTN").click()
|
||
driver.implicitly_wait(40)
|
||
time.sleep(15)
|
||
driver.implicitly_wait(40)
|
||
print(driver.title)
|
||
text = driver.page_source
|
||
driver.quit()
|
||
|
||
except Exception as e:
|
||
print("Got an exception: ", e)
|
||
finally:
|
||
print("")
|
||
#driver.quit()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
codecs.open('cache/' + filename_html,'w', 'utf-8').write(text)
|
||
|
||
|
||
|
||
#print(text)
|
||
as_list = ssb_to_csv(text)
|
||
#print(as_list)
|
||
as_dict = to_section_list(as_list)
|
||
jj = json.dumps(as_dict,indent=2)
|
||
|
||
# TODO
|
||
try:
|
||
ps = codecs.open('cache/'+filename,'r','utf-8')
|
||
prev_sched = json.loads(ps.read())
|
||
ps.close()
|
||
|
||
if 1: # sometimes I want to re-run this without affecting the logs.
|
||
log_section_filling(as_dict)
|
||
log_section_filling2(as_dict)
|
||
|
||
dd = DeepDiff(prev_sched, as_dict, ignore_order=True)
|
||
pretty_json = json.dumps( json.loads( dd.to_json() ), indent=2 )
|
||
codecs.open('cache/%s_sched_diff.json' % short_sem,'w','utf-8').write( pretty_json ) # dd.to_json() )
|
||
|
||
except Exception as e:
|
||
print(e)
|
||
print("Can't do diff?")
|
||
|
||
# Next, rename the prev sched_xxYY.json data file to have its date,
|
||
# make this new one, and then upload it to the website.
|
||
# Maybe even count the entries and do a little sanity checking
|
||
#
|
||
# print("Last modified: %s" % time.ctime(os.path.getmtime("test.txt")))
|
||
# print("Created: %s" % time.ctime(os.path.getctime("test.txt")))
|
||
|
||
|
||
try:
|
||
last_mod = time.ctime(os.path.getmtime('cache/' + filename))
|
||
|
||
import pathlib
|
||
prev_stat = pathlib.Path('cache/' + filename).stat()
|
||
mtime = dt.fromtimestamp(prev_stat.st_mtime)
|
||
print(mtime)
|
||
except:
|
||
print("Couldn't Diff.")
|
||
# fname = pathlib.Path('test.py')
|
||
# assert fname.exists(), f'No such file: {fname}' # check that the file exists
|
||
# print(fname.stat())
|
||
#
|
||
# os.stat_result(st_mode=33206, st_ino=5066549581564298, st_dev=573948050, st_nlink=1, st_uid=0, st_gid=0, st_size=413,
|
||
# st_atime=1523480272, st_mtime=1539787740, st_ctime=1523480272)
|
||
|
||
|
||
|
||
codecs.open('cache/' + filename, 'w', 'utf-8').write(jj)
|
||
|
||
|
||
|
||
put_file('/web/phowell/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
|
||
|
||
return as_dict
|
||
|
||
def dza_sched():
|
||
text = codecs.open('cache/sched_fa22_deanza.html','r','utf-8').read()
|
||
as_list = ssb_to_csv(text)
|
||
#print(as_list)
|
||
as_dict = to_section_list(as_list)
|
||
jj = json.dumps(as_dict,indent=2)
|
||
codecs.open('cache/fa22_sched_deanza.json','w','utf-8').write(jj)
|
||
|
||
# recreate schedule json files with most current online schedule format.
|
||
def recent_schedules():
|
||
# # todo: sems is a global in this file. Is that the right thing to do?
|
||
#all_scheds = [ os.listdir( 'cache/rosters/' + shortToLongSem(s)) for s in sems ]
|
||
#for i,s in enumerate(sems):
|
||
for s in ['sp21',]:
|
||
filename = 'cache/sched_' + s + '.html'
|
||
print("Filename is %s" % filename)
|
||
input = codecs.open( filename, 'r', 'utf-8').read()
|
||
output = ssb_to_csv(input)
|
||
|
||
csv_fn = 'cache/temp_sched_' + s + '.csv'
|
||
if os.path.isfile(csv_fn):
|
||
os.remove(csv_fn)
|
||
|
||
codecs.open(csv_fn,'w','utf-8').write(output)
|
||
|
||
jsn = to_section_list(output)
|
||
jsn_fn = 'cache/semesters/'+shortToLongSem(s)+'/'+s+'_sched.json'
|
||
if os.path.isfile(jsn_fn):
|
||
os.remove(jsn_fn)
|
||
codecs.open(jsn_fn,'w').write(json.dumps(jsn))
|
||
print("I put the most recent schedule JSON files in ./cache/semesters/... folders.")
|
||
|
||
|
||
|
||
|
||
|
||
################
|
||
################ ROSTERS AND REGISTRATION
|
||
################
|
||
################
|
||
################
|
||
|
||
# todo: the pipeline is disorganized. Organize it to have
|
||
# a hope of taking all this to a higher level.
|
||
#
|
||
|
||
# todo: where does this belong in the pipeline? compare with recent_schedules()
|
||
|
||
|
||
|
||
# Take the generically named roster upload files, move them to a semester folder, and date-stamp them.
|
||
def move_to_folder(sem,year,folder):
|
||
semester = year+sem
|
||
semester_path = 'cache/rosters/%s' % semester
|
||
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
|
||
print("+ Moving roster files to folder: %s" % semester_path)
|
||
if not os.path.isdir(semester_path):
|
||
print("+ Creating folder: %s" % semester_path)
|
||
os.makedirs(semester_path)
|
||
os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now))
|
||
os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now))
|
||
os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now))
|
||
|
||
|
||
|
||
# Take raw upload (csv) files and make one big json out of them.
|
||
# This relates to enrollment files, not schedule.
|
||
def convert_roster_files(semester="",year="",folder=""):
|
||
if not semester:
|
||
semester = input("the semester? (ex: spring) ")
|
||
folder = input("Folder? (ex 2020-02-25-14-58-20) ")
|
||
uf = open('cache/rosters/users-'+folder+'.csv','r')
|
||
cf = open('cache/rosters/courses-'+folder+'.csv','r')
|
||
ef = open('cache/rosters/enrollments-'+folder+'.csv','r')
|
||
u = csv.DictReader(uf)
|
||
c = csv.DictReader(cf)
|
||
e = csv.DictReader(ef)
|
||
uu = [i for i in u]
|
||
cc = [i for i in c]
|
||
ee = [i for i in e]
|
||
uf.close()
|
||
cf.close()
|
||
ef.close()
|
||
myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester)
|
||
|
||
if os.path.exists(myrosterfile):
|
||
print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile)
|
||
last_fileobj = open(myrosterfile,'r')
|
||
last_file = json.load(last_fileobj)
|
||
|
||
last_fileobj.close()
|
||
|
||
info = last_file[3]
|
||
last_date = info['date_filestring']
|
||
|
||
print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date))
|
||
|
||
try:
|
||
            os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json' % (year, semester, last_date))
|
||
print(' -- ok')
|
||
except Exception as e:
|
||
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
|
||
print(e)
|
||
myrosterfile = "new_" + myrosterfile
|
||
pass
|
||
#os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
|
||
#os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json')
|
||
|
||
newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), }
|
||
try:
|
||
new_roster = codecs.open(myrosterfile,'w', 'utf-8')
|
||
new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 ))
|
||
new_roster.close()
|
||
print(" -- Wrote roster info to: %s." % myrosterfile)
|
||
except Exception as e:
|
||
print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile)
|
||
print(" ** " + str(e))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# From instructure sftp site
|
||
def fetch_current_rosters():
|
||
dt_label = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
||
cnopts = pysftp.CnOpts()
|
||
cnopts.hostkeys = None
|
||
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
|
||
sftp.chdir('SIS')
|
||
files = sftp.listdir()
|
||
print("\n--> %s I see these files at instructure ftp site: " % dt_label )
|
||
[print(" %s" % f) for f in files]
|
||
i = 0
|
||
got_courses = 0
|
||
if len(files)>1: # and 'users.csv' in files:
|
||
try:
|
||
if 'users.csv' in files:
|
||
sftp.get('users.csv','cache/rosters/users-'+dt_label+'.csv')
|
||
i += 1
|
||
except:
|
||
print(' * users.csv not present')
|
||
try:
|
||
if 'courses.csv' in files:
|
||
sftp.get('courses.csv','cache/rosters/courses-'+dt_label+'.csv')
|
||
i += 1
|
||
got_courses = 1
|
||
except:
|
||
print(' * courses.csv not present')
|
||
try:
|
||
if 'enrollments.csv' in files:
|
||
sftp.get('enrollments.csv','cache/rosters/enrollments-'+dt_label+'.csv')
|
||
i += 1
|
||
except:
|
||
print(' * enrollments.csv not present')
|
||
print(' Saved %i data files in rosters folder.' % i)
|
||
|
||
if got_courses:
|
||
courses = open('cache/rosters/courses-%s.csv' % dt_label,'r')
|
||
courses.readline()
|
||
a = courses.readline()
|
||
print(a)
|
||
courses.close()
|
||
parts = a.split(',')
|
||
year = parts[1][0:4]
|
||
ss = parts[1][4:6]
|
||
#print parts[1]
|
||
sem = {'30':'spring', '50':'summer', '70':'fall' }
|
||
this_sem = sem[ss]
|
||
print(" -> This semester is: %s, %s" % (this_sem,year))
|
||
|
||
print(' -> %s building data file...' % dt_label)
|
||
convert_roster_files(this_sem,year,dt_label)
|
||
print(' -> moving files...')
|
||
move_to_folder(this_sem,year,dt_label)
|
||
else:
|
||
print(" * No courses file. Not moving files.")
|
||
else:
|
||
print("--> Don't see files.")
|
||
sftp.close()
|
||
|
||
def fetch_current_rosters_auto():
|
||
|
||
schedule.every().hour.at(":57").do(fetch_current_rosters)
|
||
|
||
schedule.every().day.at("12:35").do(sync_non_interactive)
|
||
schedule.every().day.at("21:00").do(sync_non_interactive)
|
||
|
||
|
||
print("running every hour on the :57\n")
|
||
while True:
|
||
try:
|
||
schedule.run_pending()
|
||
except Exception as e:
|
||
import traceback
|
||
print(" ---- * * * Failed with: %s" % str(e))
|
||
ff = open('cache/pipeline.log.txt','a')
|
||
ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n")
|
||
ff.write(traceback.format_exc()+"\n---------\n\n")
|
||
ff.close()
|
||
#schedule.CancelJob
|
||
time.sleep(15)
|
||
|
||
|
||
|
||
|
||
# read schedule file with an eye toward watching what's filling up
|
||
def schedule_filling():
|
||
sem = 'spring2021' # todo: hardcoded
|
||
days = []
|
||
for f in sorted(os.listdir('cache/rosters/'+sem+'/')):
|
||
if f.endswith('.html'):
|
||
match = re.search(r'sched_(\d\d\d\d)_(\d\d)_(\d+)\.html',f)
|
||
if match:
|
||
print(f)
|
||
y = match.group(1)
|
||
m = match.group(2)
|
||
d = match.group(3)
|
||
print("Schedule from %s %s %s." % (y,m,d))
|
||
csv_sched = ssb_to_csv(open('cache/rosters/'+sem+'/'+f,'r').read())
|
||
jsn = to_section_list(csv_sched)
|
||
#print(json.dumps(jsn,indent=2))
|
||
days.append(jsn)
|
||
day1 = days[-2]
|
||
day2 = days[-1]
|
||
df = jsondiff.diff(day1, day2)
|
||
gains = defaultdict( list )
|
||
|
||
for D in df.keys():
|
||
if isinstance(D, int):
|
||
#print(day1[D]['code'] + '\t' + day1[D]['crn'] + ' Before: ' + day1[D]['act'] + ' After: ' + day2[D]['act'])
|
||
try:
|
||
gain = int(day2[D]['act']) - int(day1[D]['act'])
|
||
gains[gain].append( day1[D]['code'] + ' ' + day1[D]['crn'] )
|
||
except:
|
||
print("No gain for " + str(D))
|
||
#print("\t" + str(df[D]))
|
||
else:
|
||
print(D)
|
||
print(df[D])
|
||
for key, value in sorted(gains.items(), key=lambda x: x[0]):
|
||
print("{} : {}".format(key, value))
|
||
|
||
#print(json.dumps(gains,indent=2))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
################
|
||
################ SENDING DATA AWAY
|
||
################
|
||
################
|
||
################
|
||
|
||
# Upload a json file to www
|
||
def put_file(remotepath,localpath, localfile,prompt=1):
|
||
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
||
cnopts = pysftp.CnOpts()
|
||
|
||
cnopts.hostkeys = None
|
||
|
||
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
|
||
#todo: these paths
|
||
#files = sftp.listdir()
|
||
#print(folder + "\tI see these files on remote: ", files, "\n")
|
||
sftp.chdir(remotepath)
|
||
files = sftp.listdir()
|
||
print(folder + "\tI see these files on remote: ", files, "\n")
|
||
|
||
localf = os.listdir(localpath)
|
||
|
||
print("I see these local: ", localf)
|
||
|
||
if prompt:
|
||
input('ready to upload')
|
||
sftp.put(localpath+localfile, localfile, preserve_mtime=True)
|
||
sftp.close()
|
||
|
||
|
||
"""
|
||
# copy files and directories from local static, to remote static,
|
||
# preserving modification times on the files
|
||
for f in localf:
|
||
print("This local file: " + f + " ", end=' ')
|
||
if not f in files:
|
||
sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
|
||
print("Uploaded.")
|
||
else:
|
||
print("Skipped.")
|
||
"""
|
||
|
||
"""if len(files)==3 and 'users.csv' in files:
|
||
sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
|
||
sftp.get('users.csv','rosters/users-'+folder+'.csv')
|
||
sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
|
||
print folder + '\tSaved three data files in rosters folder.'
|
||
|
||
courses = open('rosters/courses-'+folder+'.csv','r')
|
||
courses.readline()
|
||
a = courses.readline()
|
||
print a
|
||
courses.close()
|
||
parts = a.split(',')
|
||
year = parts[1][0:4]
|
||
ss = parts[1][4:6]
|
||
#print parts[1]
|
||
sem = {'30':'spring', '50':'summer', '70':'fall' }
|
||
this_sem = sem[ss]
|
||
#print this_sem, "", year
|
||
print folder + '\tbuilding data file...'
|
||
convert_roster_files(this_sem,year,folder)
|
||
print folder + '\tmoving files...'
|
||
move_to_folder(this_sem,year,folder)
|
||
else:
|
||
print folder + "\tDon't see all three files."""
|
||
|
||
|
||
|
||
################
|
||
################ GOOGLE DOCS
|
||
################
|
||
################
|
||
################
|
||
|
||
def sec(t): return "<h3>"+t+"</h3>\n"
|
||
def para(t): return "<p>"+t+"</p>\n"
|
||
def ul(t): return "<ul>"+t+"</ul>\n"
|
||
def li(t): return "<li>"+t+"</li>\n"
|
||
|
||
def question(t,bracket=1):
|
||
ret = ''
|
||
match = re.search( r'\[(.*)\]', t)
|
||
if match and bracket:
|
||
ret += "<a name='" + match.group(1) + "'></a>"
|
||
t = re.sub( r'\[.*\]','',t)
|
||
else:
|
||
parts = t.split(' ')
|
||
id = ''
|
||
for p in parts:
|
||
if re.search(r'[a-zA-Z]',p[0]): id += p[0]
|
||
ret += "<a name='%s'></a>" % id.lower()
|
||
return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
|
||
|
||
def answer(t):
|
||
return t + '</div></div>\n'
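# Example (sketch): these small helpers are composed into the accordion HTML that
# get_doc() emits. The question/answer text is made up.
#
#   html = sec('Getting started')
#   html += question('How do I log in to iLearn? [login]')
#   html += answer(para('Use your campus username and password.'))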
|
||
|
||
def read_paragraph_element(element,type="NORMAL_TEXT"):
|
||
"""Returns the text in the given ParagraphElement.
|
||
|
||
Args:
|
||
element: a ParagraphElement from a Google Doc.
|
||
"""
|
||
text_run = element.get('textRun')
|
||
begin = ''
|
||
end = ''
|
||
if not text_run:
|
||
return ''
|
||
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
|
||
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
|
||
end = '</a>'
|
||
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
|
||
begin = '<strong>' + begin
|
||
end = end + '</strong>'
|
||
|
||
content = text_run.get('content')
|
||
content = re.sub(u'\u000b','<br />\n',content)
|
||
|
||
return begin + content + end
|
||
|
||
|
||
def get_doc(docid, bracket=1, verbose=0):
|
||
import pickle
|
||
import os.path
|
||
from googleapiclient.discovery import build
|
||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||
from google.auth.transport.requests import Request
|
||
|
||
#ooout = open(fileout,'w')
|
||
|
||
# If modifying these scopes, delete the file token.pickle.
|
||
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
||
creds = None
|
||
# The file token.pickle stores the user's access and refresh tokens, and is
|
||
# created automatically when the authorization flow completes for the first
|
||
# time.
|
||
if os.path.exists('token.pickle'):
|
||
with open('token.pickle', 'rb') as token:
|
||
creds = pickle.load(token)
|
||
# If there are no (valid) credentials available, let the user log in.
|
||
if not creds or not creds.valid:
|
||
if creds and creds.expired and creds.refresh_token:
|
||
creds.refresh(Request())
|
||
else:
|
||
flow = InstalledAppFlow.from_client_secrets_file(
|
||
'credentials.json', SCOPES)
|
||
creds = flow.run_local_server(port=0)
|
||
# Save the credentials for the next run
|
||
with open('token.pickle', 'wb') as token:
|
||
pickle.dump(creds, token)
|
||
|
||
service = build('docs', 'v1', credentials=creds)
|
||
|
||
# Retrieve the documents contents from the Docs service.
|
||
document = service.documents().get(documentId=docid).execute()
|
||
if verbose: print(document)
|
||
|
||
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
|
||
tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
|
||
if verbose: print('The title of the document is: {}'.format(document.get('title')))
|
||
doc_content = document.get('body').get('content')
|
||
if verbose: print(doc_content)
|
||
|
||
doc_objects = document.get('inlineObjects')
|
||
if verbose: print(doc_objects)
|
||
|
||
doc_lists = document.get('lists')
|
||
|
||
text = '<div class="acrd_grp" data-accordion-group="">'
|
||
last_type = ''
|
||
answer_text = ''
|
||
in_a_list = ''
|
||
|
||
img_count = 1
|
||
img_lookup = {}
|
||
img_heights = {}
|
||
img_widths = {}
|
||
|
||
if doc_objects:
|
||
for k,value in doc_objects.items():
|
||
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
||
if 'inlineObjectProperties' in value:
|
||
if 'embeddedObject' in value['inlineObjectProperties']:
|
||
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
|
||
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
|
||
print(k)
|
||
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
|
||
response = requests.get(uu, stream=True)
|
||
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
|
||
img_count += 1
|
||
|
||
img_lookup[k] = name
|
||
|
||
with open('cache/doc_images/'+name, 'wb') as out_file:
|
||
shutil.copyfileobj(response.raw, out_file)
|
||
print(uu)
|
||
print(response.headers)
|
||
print(name)
|
||
#input('x?')
|
||
del response
|
||
if 'size' in value['inlineObjectProperties']['embeddedObject']:
|
||
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
|
||
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
|
||
|
||
tempout.write('- - - - - - - -\n\n')
|
||
#for value in doc_lists:
|
||
# tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
||
|
||
tempout.write('- - - - - - - -\n\n')
|
||
list_stack = []
|
||
list_depth = 0
|
||
last_list_depth = 0
|
||
for value in doc_content:
|
||
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
|
||
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
|
||
|
||
# todo: x link, x bold, list, image.
|
||
tag_fxn = para
|
||
if 'paragraph' in value:
|
||
this_text = ''
|
||
|
||
if 'bullet' in value['paragraph']:
|
||
# either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
|
||
|
||
lid = value['paragraph']['bullet']['listId']
|
||
|
||
if not list_stack: # 1
|
||
list_stack.append(lid)
|
||
else:
|
||
if lid == list_stack[0]: # 2
|
||
pass
|
||
|
||
else:
|
||
if not lid in list_stack: # 3
|
||
list_stack.append(lid)
|
||
else: # 4
|
||
x = list_stack.pop()
|
||
while x != lid: list_stack.pop()
|
||
elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
|
||
list_stack = []
|
||
|
||
list_depth = len(list_stack)
|
||
|
||
deeper = list_depth - last_list_depth
|
||
|
||
if deeper > 0:
|
||
answer_text += "<ul>" * deeper
|
||
elif deeper < 0:
|
||
deeper = -1 * deeper
|
||
answer_text += "</ul>" * deeper
|
||
|
||
if len(list_stack):
|
||
tag_fxn = li
|
||
|
||
elements = value.get('paragraph').get('elements')
|
||
|
||
# inlineObjectElement": {
|
||
# "inlineObjectId": "kix.ssseeu8j9cfx",
|
||
|
||
if 'paragraphStyle' in value.get('paragraph'):
|
||
style = value.get('paragraph').get('paragraphStyle')
|
||
#text += json.dumps(style, sort_keys=True, indent=4)
|
||
if 'namedStyleType' in style:
|
||
type = style['namedStyleType']
|
||
|
||
for elem in elements:
|
||
|
||
# text content
|
||
this_text += read_paragraph_element(elem,type)
|
||
|
||
# image content
|
||
if 'inlineObjectElement' in elem:
|
||
vpi = elem['inlineObjectElement']
|
||
if 'inlineObjectId' in vpi:
|
||
ii = vpi['inlineObjectId']
|
||
if ii in img_lookup:
|
||
img = img_lookup[ii]
|
||
h = img_heights[ii]
|
||
w = img_widths[ii]
|
||
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
|
||
|
||
|
||
|
||
if last_type=='NORMAL_TEXT' and type!=last_type:
|
||
text += answer(answer_text)
|
||
answer_text = ''
|
||
|
||
if type=='HEADING_2':
|
||
text += sec(this_text)
|
||
this_text = ''
|
||
elif type=='HEADING_3':
|
||
text += question(this_text,bracket)
|
||
this_text = ''
|
||
else:
|
||
answer_text += tag_fxn(this_text)
|
||
this_text = ''
|
||
last_type = type
|
||
last_list_depth = list_depth
|
||
|
||
elif 'table' in value:
|
||
# The text in table cells are in nested Structural Elements and tables may be
|
||
# nested.
|
||
text += "\nTABLE\n"
|
||
#table = value.get('table')
|
||
#for row in table.get('tableRows'):
|
||
# cells = row.get('tableCells')
|
||
# for cell in cells:
|
||
# text += read_strucutural_elements(cell.get('content'))
|
||
#elif 'tableOfContents' in value:
|
||
# # The text in the TOC is also in a Structural Element.
|
||
# toc = value.get('tableOfContents')
|
||
# text += read_strucutural_elements(toc.get('content'))
|
||
|
||
#else:
|
||
# print(json.dumps(value, sort_keys=True, indent=4))
|
||
|
||
text += answer(answer_text)
|
||
#text += '</div>'
|
||
#print(text)
|
||
return text
|
||
|
||
######### TRY #2 ######
|
||
|
||
|
||
def read_paragraph_element_2(element,type="NORMAL_TEXT"):
|
||
text_run = element.get('textRun')
|
||
begin = ''
|
||
end = ''
|
||
if not text_run: return ''
|
||
if 'textStyle' in text_run and 'link' in text_run['textStyle']:
|
||
begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
|
||
end = '</a>'
|
||
if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
|
||
begin = '<strong>' + begin
|
||
end = end + '</strong>'
|
||
elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
|
||
begin = '<em>' + begin
|
||
end = end + '</em>'
|
||
content = text_run.get('content')
|
||
content = re.sub(u'\u000b','<br />\n',content)
|
||
return begin + content + end
|
||
|
||
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
|
||
def handle_icons(t):
|
||
text = t[7:].strip()
|
||
parts = text.split(", ")
|
||
return ('icons',parts)
|
||
|
||
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
|
||
def handle_tags(t):
|
||
text = t[6:].strip()
|
||
parts = text.split(", ")
|
||
return ('tags',parts)
|
||
|
||
def handle_question(t,bracket=1):
|
||
anchor = ''
|
||
match = re.search( r'\[(.*)\]', t)
|
||
if match and bracket:
|
||
anchor = match.group(1).lower()
|
||
t = re.sub( r'\[.*\]','',t)
|
||
else:
|
||
parts = t.split(' ')
|
||
for p in parts:
|
||
if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
|
||
return ('question', t, anchor)
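# For instance (question text made up): with a [bracketed] tag the anchor comes from
# the tag, otherwise it is built from the first letters of the words.
#   handle_question('How do I reset my password? [reset]') -> ('question', 'How do I reset my password? ', 'reset')
#   handle_question('Dropping a class', bracket=0)         -> ('question', 'Dropping a class', 'dac')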
|
||
|
||
def handle_answer(t):
|
||
return ('answer',t)
|
||
|
||
def handle_sec(t): return ('section',t)
|
||
def handle_para(t): return ('paragraph',t)
|
||
def handle_ul(t): return ('unorderdedlist',t)
|
||
def handle_li(t): return ('listitem',t)
|
||
|
||
|
||
|
||
img_count = 1
|
||
img_lookup = {}
|
||
img_heights = {}
|
||
img_widths = {}
|
||
|
||
|
||
def fetch_doc_image(k,value):
|
||
global img_count, img_lookup, img_heights, img_widths
|
||
if 'inlineObjectProperties' in value:
|
||
if 'embeddedObject' in value['inlineObjectProperties']:
|
||
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
|
||
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
|
||
print(k)
|
||
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
|
||
response = requests.get(uu, stream=True)
|
||
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
|
||
img_count += 1
|
||
img_lookup[k] = name
|
||
|
||
with open('cache/doc_images/'+name, 'wb') as out_file:
|
||
shutil.copyfileobj(response.raw, out_file)
|
||
print(uu)
|
||
print(response.headers)
|
||
print(name)
|
||
del response
|
||
if 'size' in value['inlineObjectProperties']['embeddedObject']:
|
||
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
|
||
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
|
||
|
||
|
||
def get_doc_generic(docid, bracket=1, verbose=0):
|
||
import pickle
|
||
import os.path
|
||
from googleapiclient.discovery import build
|
||
from google_auth_oauthlib.flow import InstalledAppFlow
|
||
from google.auth.transport.requests import Request
|
||
global img_count, img_lookup, img_heights, img_widths
|
||
|
||
# If modifying these scopes, delete the file token.pickle.
|
||
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
||
creds = None
|
||
# The file token.pickle stores the user's access and refresh tokens, and is
|
||
# created automatically when the authorization flow completes for the first
|
||
# time.
|
||
if os.path.exists('token.pickle'):
|
||
with open('token.pickle', 'rb') as token:
|
||
creds = pickle.load(token)
|
||
if not creds or not creds.valid:
|
||
if creds and creds.expired and creds.refresh_token:
|
||
creds.refresh(Request())
|
||
else:
|
||
flow = InstalledAppFlow.from_client_secrets_file(
|
||
'credentials.json', SCOPES)
|
||
creds = flow.run_local_server(port=0)
|
||
# Save the credentials for the next run
|
||
with open('token.pickle', 'wb') as token:
|
||
pickle.dump(creds, token)
|
||
|
||
service = build('docs', 'v1', credentials=creds)
|
||
|
||
# Retrieve the documents contents from the Docs service.
|
||
document = service.documents().get(documentId=docid).execute()
|
||
|
||
tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
|
||
tempout.write( json.dumps(document,indent=2) \
|
||
+ "\n\n\n------------------------------------\n\n")
|
||
if verbose: print('The title of the document is: {}'.format(document.get('title')))
|
||
|
||
doc_content = document.get('body').get('content')
|
||
doc_objects = document.get('inlineObjects')
|
||
doc_lists = document.get('lists')
|
||
|
||
#text = ''
|
||
result = []
|
||
last_type = ''
|
||
#answer_text = ''
|
||
answer = []
|
||
in_a_list = ''
|
||
|
||
# Get all the images
|
||
for k,value in doc_objects.items():
|
||
tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
|
||
fetched = fetch_doc_image(k,value)
|
||
|
||
list_stack = []
|
||
list_depth = 0
|
||
last_list_depth = 0
|
||
for value in doc_content:
|
||
tempout.write( json.dumps(value,indent=2) + "\n\n\n")
|
||
if verbose: print(json.dumps(value, sort_keys=True, indent=4))
|
||
|
||
tag_fxn = handle_para
|
||
if 'paragraph' in value:
|
||
this_text = ''
|
||
|
||
# First we deal with if we're in a list.
|
||
if 'bullet' in value['paragraph']:
|
||
# either we're (1)starting a new list, (2)in one (do nothing),
|
||
# (3)starting a nested one, or (4)finished a nested one.
|
||
lid = value['paragraph']['bullet']['listId']
|
||
if not list_stack: # 1
|
||
list_stack.append(lid)
|
||
else:
|
||
if not lid == list_stack[0]:
|
||
if not lid in list_stack: # 3
|
||
list_stack.append(lid)
|
||
else: # 4
|
||
x = list_stack.pop()
|
||
while x != lid: list_stack.pop()
|
||
elif len(list_stack) > 0:
|
||
# current para isn't a bullet but we still have a list open.
|
||
list_stack = []
|
||
|
||
|
||
list_depth = len(list_stack)
|
||
deeper = list_depth - last_list_depth
|
||
if deeper > 0:
|
||
answer.append("<ul>" * deeper)
|
||
elif deeper < 0:
|
||
deeper = -1 * deeper
|
||
answer.append("</ul>" * deeper)
|
||
if len(list_stack):
|
||
tag_fxn = handle_li
|
||
|
||
# NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
|
||
elements = value.get('paragraph').get('elements')
|
||
if 'paragraphStyle' in value.get('paragraph'):
|
||
style = value.get('paragraph').get('paragraphStyle')
|
||
if 'namedStyleType' in style:
|
||
type = style['namedStyleType']
|
||
|
||
# and FINALLY, the actual contents.
|
||
for elem in elements:
|
||
# text content
|
||
this_text += read_paragraph_element_2(elem,type)
|
||
|
||
# image content
|
||
if 'inlineObjectElement' in elem:
|
||
vpi = elem['inlineObjectElement']
|
||
if 'inlineObjectId' in vpi:
|
||
ii = vpi['inlineObjectId']
|
||
if ii in img_lookup:
|
||
img = img_lookup[ii]
|
||
h = img_heights[ii]
|
||
w = img_widths[ii]
|
||
this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
|
||
|
||
|
||
# Now for something tricky. Call an appropriate handler, based on:
|
||
# (a) what is the paragraph style type?
|
||
# (b) is it different from the prev one?
|
||
|
||
if last_type=='NORMAL_TEXT' and type!=last_type:
|
||
if this_text.strip():
|
||
result.append(handle_answer(answer))
|
||
answer = []
|
||
#answer_text = ''
|
||
|
||
if type=='HEADING_2' and this_text.strip():
|
||
result.append( handle_sec(this_text) )
|
||
this_text = ''
|
||
elif type=='HEADING_3' and this_text.strip():
|
||
result.append(handle_question(this_text,bracket))
|
||
this_text = ''
|
||
else:
|
||
if this_text.lower().startswith('tags:'):
|
||
tag_fxn = handle_tags
|
||
if this_text.lower().startswith('icons:'):
|
||
tag_fxn = handle_icons
|
||
if this_text.strip():
|
||
answer.append(tag_fxn(this_text))
|
||
this_text = ''
|
||
last_type = type
|
||
last_list_depth = list_depth
|
||
|
||
elif 'table' in value:
|
||
pass
|
||
|
||
|
||
result.append(handle_answer(answer))
|
||
return json.dumps(result,indent=4)
|
||
|
||
|
||
|
||
|
||
def scrape_schedule_py():
|
||
return 1
|
||
|
||
"""
|
||
cur_session = requests.Session()
|
||
mygav_url = "https://lum-prod.ec.gavilan.edu/"
|
||
|
||
r1 = requests.get(mygav_url)
|
||
|
||
login_url1 = "https://lum-prod.ec.gavilan.edu/c/portal/login"
|
||
|
||
|
||
login_url = "https://eis-prod.ec.gavilan.edu/authenticationendpoint/login.do?commonAuthCallerPath=%2Fsamlsso&forceAuth=false&passiveAuth=false&tenantDomain=carbon.super&sessionDataKey=57203341-6823-4511-b88e-4e104aa2fd71&relyingParty=LP5PROD_LuminisPortalEntity&type=samlsso&sp=Luminis+Portal+PROD&isSaaSApp=false&authenticators=BasicAuthenticator:LOCAL"
|
||
"""
|
||
|
||
|
||
|
||
def scrape_schedule_multi():
|
||
|
||
global SEMESTER, short_sem, semester_begin, filename, filename_html
|
||
|
||
SEMESTER = 'Summer 2022'
|
||
short_sem = 'su22'
|
||
semester_begin = strptime('06/13', '%m/%d')
|
||
filename = 'su22_sched.json'
|
||
filename_html = 'su22_sched.html'
|
||
|
||
scrape_schedule()
|
||
|
||
SEMESTER = 'Fall 2022'
|
||
short_sem = 'fa22'
|
||
semester_begin = strptime('08/22', '%m/%d')
|
||
filename = 'fa22_sched.json'
|
||
filename_html = 'fa22_sched.html'
|
||
|
||
scrape_schedule()
|
||
|
||
SEMESTER = 'Spring 2022'
|
||
short_sem = 'sp22'
|
||
semester_begin = strptime('01/31', '%m/%d')
|
||
filename = 'sp22_sched.json'
|
||
filename_html = 'sp22_sched.html'
|
||
|
||
scrape_schedule()
|
||
|
||
|
||
|
||
if __name__ == "__main__":
|
||
|
||
print ('')
|
||
options = { 1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,
|
||
2: ['Fetch rosters',fetch_current_rosters] ,
|
||
3: ['Fetch rosters AND canvas data automatically',fetch_current_rosters_auto] ,
|
||
4: ['Compute how registration is filling up classes', schedule_filling] ,
|
||
5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] ,
|
||
6: ['Canvas data: interactive sync', interactive ],
|
||
7: ['Canvas data: automated sync', sync_non_interactive ],
|
||
8: ['Scrape schedule from ssb', scrape_schedule_multi ],
|
||
9: ['Test ssb calls with python', scrape_schedule_py ],
|
||
10: ['Parse deanza schedule', dza_sched ],
|
||
}
|
||
|
||
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
|
||
resp = int(sys.argv[1])
|
||
print("\n\nPerforming: %s\n\n" % options[resp][0])
|
||
|
||
else:
|
||
print ('')
|
||
for key in options:
|
||
print(str(key) + '.\t' + options[key][0])
|
||
|
||
print('')
|
||
resp = input('Choose: ')
|
||
|
||
# Call the function in the options dict
|
||
options[ int(resp)][1]()
|
||
|
||
# Testing
|
||
|
||
#if __name__ == "__main__":
|
||
#users = fetch('/api/v1/courses/69/users?per_page=100',1)
|
||
#print "These are the users: "
|
||
#print users
|
||
|
||
#getSemesterSchedule()
|
||
|
||
|
||
#get_doc()
|
||
#pass
|