synced pipelines w/ surface

This commit is contained in:
Peter Howell 2023-03-24 09:58:20 -07:00
parent 411971cf29
commit 37e8007035
1 changed file with 252 additions and 36 deletions

View File

@ -5,6 +5,7 @@ from time import strptime
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from util import UnicodeDictReader from util import UnicodeDictReader
from datetime import datetime as dt from datetime import datetime as dt
from dateutil import parser
import pandas as pd import pandas as pd
import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib, datetime import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib, datetime
@ -29,9 +30,6 @@ And some subsequent processing:
- Raw logs into something more useful - Raw logs into something more useful
""" """
verbose = False verbose = False
users = {} users = {}
@ -428,6 +426,10 @@ def getSemesterSchedule(short='sp21'): # I used to be current_sch
return schedule return schedule
def get_enrlmts_for_user(user, enrollments):
    """Return one user's active enrollments as (type, course_id) rows.

    user        -- user id matched against the 'user_id' column
    enrollments -- pandas DataFrame with 'user_id', 'workflow', 'type'
                   and 'course_id' columns
    """
    is_user = enrollments['user_id'] == user
    is_active = enrollments['workflow'] == 'active'
    return enrollments[is_user & is_active][['type', 'course_id']]
################ ################
@ -812,7 +814,7 @@ def to_section_list(input_text,verbose=0):
if 'loc' in r: if 'loc' in r:
if r['loc'] == 'ONLINE': r['type'] = 'online' if r['loc'] == 'ONLINE': r['type'] = 'online'
if r['loc'] == 'ONLINE' and r['time']: r['type'] = 'online live'
if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live' if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
if r['loc']: r['site'] = room_to_site(r['loc'],verbose) if r['loc']: r['site'] = room_to_site(r['loc'],verbose)
@ -893,9 +895,7 @@ def log_section_filling2(current_sched_list):
reg_data_filename = 'reg_data_' + short_sem + '.csv' reg_data_filename = 'reg_data_' + short_sem + '.csv'
new_df.to_csv('cache/' + reg_data_filename, index=False) new_df.to_csv('cache/' + reg_data_filename, index=False)
put_file('/web/phowell/schedule/', 'cache/', reg_data_filename, 0) put_file('/home/public/schedule/', 'cache/', reg_data_filename, 0)
print('ok')
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed # Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
@ -1022,9 +1022,7 @@ def scrape_schedule():
codecs.open('cache/' + filename, 'w', 'utf-8').write(jj) codecs.open('cache/' + filename, 'w', 'utf-8').write(jj)
put_file('/home/public/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
put_file('/web/phowell/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
return as_dict return as_dict
@ -1081,7 +1079,8 @@ def recent_schedules():
# Take the generically named rosters uploads files and move them to a semester folder and give them a date. # Take the generically named rosters uploads files and move them to a semester folder and give them a date.
def move_to_folder(sem,year,folder): def move_to_folder(sem,year,folder):
semester = year+sem semester = year+sem
semester_path = 'cache/rosters/%s' % semester if not os.path.isdir('cache/rosters/'+semester):
os.makedirs('cache/rosters/'+semester)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
print("+ Moving roster files to folder: %s" % semester_path) print("+ Moving roster files to folder: %s" % semester_path)
if not os.path.isdir(semester_path): if not os.path.isdir(semester_path):
@ -1159,7 +1158,7 @@ def fetch_current_rosters():
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp: with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
sftp.chdir('SIS') sftp.chdir('SIS')
files = sftp.listdir() files = sftp.listdir()
print("\n--> %s I see these files at instructure ftp site: " % dt_label ) print("--> %s I see these files at instructure ftp site: " % dt_label )
[print(" %s" % f) for f in files] [print(" %s" % f) for f in files]
i = 0 i = 0
got_courses = 0 got_courses = 0
@ -1287,9 +1286,9 @@ def schedule_filling():
# Upload a json file to www # Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1): def put_file(remotepath,localpath, localfile,prompt=1):
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts() cnopts = pysftp.CnOpts()
cnopts.hostkeys = None cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
@ -1298,12 +1297,9 @@ def put_file(remotepath,localpath, localfile,prompt=1):
#print(folder + "\tI see these files on remote: ", files, "\n") #print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath) sftp.chdir(remotepath)
files = sftp.listdir() files = sftp.listdir()
print(folder + "\tI see these files on remote: ", files, "\n") if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath) localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
print("I see these local: ", localf)
if prompt: if prompt:
input('ready to upload') input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True) sftp.put(localpath+localfile, localfile, preserve_mtime=True)
@ -1862,32 +1858,249 @@ def scrape_schedule_multi():
global SEMESTER, short_sem, semester_begin, filename, filename_html global SEMESTER, short_sem, semester_begin, filename, filename_html
SEMESTER = 'Summer 2022' SEMESTER = 'Spring 2023'
short_sem = 'su22' short_sem = 'sp23'
semester_begin = strptime('06/13', '%m/%d') semester_begin = strptime('01/30', '%m/%d')
filename = 'su22_sched.json' filename = 'sp23_sched.json'
filename_html = 'su22_sched.html' filename_html = 'sp23_sched.html'
scrape_schedule() SEM = ['Fall 2022', 'Summer 2022 (View only)', 'Spring 2022 (View only)',
'Fall 2021 (View only)', 'Summer 2021 (View only)', 'Spring 2021 (View only)', 'Fall 2020 (View only)', 'Summer 2020 (View only)', 'Spring 2020 (View only)',
'Fall 2019 (View only)', 'Summer 2019 (View only)', 'Spring 2019 (View only)', 'Fall 2018 (View only)', 'Summer 2018 (View only)', 'Spring 2018 (View only)' ]
srt = 'fa22,su22,sp22,fa21,su21,sp21,fa20,su20,sp20,fa19,su19,sp19,fa18,su18,sp18'.split(',')
beg = ['08/22','06/13','01/31','08/23','06/14','02/01','08/24','06/15','01/27','08/26','06/17','01/28','08/27','06/18','01/29']
#for i in [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]:
#SEMESTER = SEM[i]
#short_sem = srt[i]
#semester_begin = strptime(beg[i], '%m/%d')
#filename = '%s_sched.json' % short_sem
#filename_html = '%s_sched.html' % short_sem
as_dict = scrape_schedule()
expanded = list_latestarts(short_sem)
fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")
ffcsv = codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8')
with ffcsv as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(fields)
for S in expanded:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
# S['act'] = S['cap']
if S['loc'] == "ONLINE LIVE": S['site'] = 'OnlineLive'
csvwriter.writerow( [ S[x] for x in fields ] )
put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)
def scrape_for_db():
    """Scrape the current (Spring 2023) schedule and emit it as a SQL script.

    Sets the module-level semester globals, scrapes the schedule via
    scrape_schedule(), then writes cache/<short_sem>_sched.sql containing a
    CREATE TABLE plus one INSERT per section, followed by an UPDATE that
    normalizes ONLINE LIVE sections' site.  Relies on the module-level
    ``gp`` and ``dean`` lookup tables keyed by department code.
    """
    global SEMESTER, gp, dean, short_sem, semester_begin, filename, filename_html
    fields = 'sem,crn,dept,num,gp,dean,code,name,teacher,type,cap,act,loc,site,date,days,time,cred,ztc'.split(',')
    SEMESTER = 'Spring 2023 (View only)'
    short_sem = 'sp23'
    semester_begin = strptime('01/30', '%m/%d')
    filename = 'sp23_sched.json'
    filename_html = 'sp23_sched.html'
    as_dict = scrape_schedule()
    # BUG FIX: formatting with `filename` produced 'sp23_sched.json_sched.sql';
    # the semester code is what was intended.
    fff = codecs.open('cache/%s_sched.sql' % short_sem, 'w', 'utf-8')
    fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
    for S in as_dict:
        # Derive department / course number from the 'code' field, e.g. 'CSIS 1'.
        parts = S['code'].split(' ')
        S['dept'] = parts[0]
        S['num'] = parts[1]
        S['gp'] = gp[parts[0]]
        S['dean'] = dean[parts[0]]
        S['sem'] = short_sem
        # Renamed from `str` so the builtin is not shadowed; single quotes are
        # stripped from values because they are spliced into quoted SQL.
        stmt = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
            ", ".join(["'" + re.sub(r"'", "", S[x]) + "'" for x in fields])
        print(stmt)
        fff.write(stmt)
    fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
    fff.close()
def argos_data(term='sp23'):
    """Convert an Argos draft-schedule CSV into a slim enrollment CSV.

    Reads cache/sched_draft_<term>.csv and writes cache/enrollment_<term>.csv
    with one row per section (gp, dean, dept, num, code, crn, name, act, site).
    Off-campus sections report their session value as the site.  Relies on
    the module-level ``dean`` and ``gp`` lookup tables keyed by subject code.

    term -- short semester code used in both file names (default 'sp23',
            matching the previous hard-coded behavior)
    """
    global dean, gp
    # `with` ensures both handles are closed/flushed (the old code leaked them).
    with codecs.open('cache/enrollment_%s.csv' % term, 'w', 'utf-8') as f2, \
         codecs.open('cache/sched_draft_%s.csv' % term, 'r', 'utf-8') as f:
        writer = csv.writer(f2)
        writer.writerow('gp dean dept num code crn name act site'.split(' '))
        reader = csv.reader(f, delimiter=',')
        headers = next(reader)
        for r in reader:
            d = dict(zip(headers, r))
            print(d)
            dept = d['Subj']
            my_dean = dean[dept]
            my_gp = gp[dept]
            num = d['Crse No']
            code = dept + " " + num
            crn = d['CRN']
            name = d['Course Title']
            act = d['Open Seats']
            # Off-campus sections carry their location in 'Session' instead.
            if d['Campus'] == "Off Campus":
                site = d['Session']
            else:
                site = d['Campus']
            print(site)
            writer.writerow([my_gp, my_dean, dept, num, code, crn, name, act, site])
def expand_old_semesters():
    """Regenerate the expanded schedule json for past semesters.

    Walks the terms newest-first, calling list_latestarts() for each, and
    pauses for a keypress between terms so the uploads can be supervised.

    NOTE: only sp16..su20 are processed; the original longer list through
    fa22 was dead code (immediately overwritten) and has been removed.
    """
    terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20'.split(',')
    terms.reverse()
    for t in terms:
        list_latestarts(t)
        input('press return to continue.')
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
    """Expand a semester's scraped schedule with parsed dates and times.

    Reads cache/<term>_sched.json, derives per-section fields (in-person /
    hybrid type, time_start/time_end, start/end month-day strings, day of
    year), writes cache/<term>_sched_expanded.json (and uploads it via
    put_file), and writes cache/<term>_latestarts.txt grouping courses by
    start date.

    term -- short semester code such as 'sp23'; the century is assumed to
            be 20xx when reconstructing full dates.
    Returns the list of expanded section dicts.
    """
    show_summary = 1
    the_year = '20' + term[2:4]
    print("year: ", the_year, " semester: ", term)
    term_in = "cache/%s_sched.json" % term
    term_out = "cache/%s_latestarts.txt" % term
    expanded_out = "%s_sched_expanded.json" % term
    print("Writing output to " + term_out)
    infile = codecs.open(term_in, "r", "utf-8")
    sched = json.loads(infile.read())
    infile.close()
    outfile = codecs.open(term_out, "w", "utf-8")
    exoutfile = codecs.open('cache/' + expanded_out, "w", "utf-8")
    expanded = []
    by_date = {}
    if show_summary: print("course \t loc \t type \t time")
    for C in sched:
        # Untyped sections that are not ONLINE default to in-person.
        if (not C['type']) and C['loc'] != 'ONLINE':
            C['type'] = 'in-person'
        if show_summary: print("%s \t %s \t %s \t %s" % (C['code'], C['loc'], C['type'], C['time']))
        # A secondary 'online' meeting pattern on a non-ONLINE-LIVE section
        # marks the section as hybrid.
        if 'extra' in C:
            if 'partofday' in C and ('type' in C['extra'][0]) and (C['extra'][0]['type'] == 'online') and C['loc'] != "ONLINE LIVE":
                C['type'] = 'hybrid'
        # Split 'H:MM am-H:MM pm' into 24-hour-ish start/end strings.
        times = C['time'].split("-")
        if len(times) > 1:
            time_start = times[0]
            time_end = times[1]
            try:
                startt = time.strptime(time_start, "%I:%M %p")
                endt = time.strptime(time_end, "%I:%M %p")
                min_start = "00" if startt.tm_min == 0 else str(startt.tm_min)
                min_end = "00" if endt.tm_min == 0 else str(endt.tm_min)
                C['time_start'] = "%i:%s" % (startt.tm_hour, min_start)
                C['time_end'] = "%i:%s" % (endt.tm_hour, min_end)
            except Exception as e:
                print(e, "\n-- problem parsing time ", time_start, " or ", time_end)
        else:
            C['time_start'] = ''
            C['time_end'] = ''
        # TBA dates get empty start/end and are not grouped by date.
        if re.search('TBA', C['date']):
            C['start'] = ''
            C['end'] = ''
            C['doy'] = ''
            expanded.append(C)
            continue
        parts = C['date'].split("-")
        start = parts[0] + "/" + the_year
        end = parts[1] + "/" + the_year
        try:
            startd = parser.parse(start)
            endd = parser.parse(end)
        except Exception as e:
            print(e, "\n-- problem parsing ", start, " or ", end)
            # BUG FIX: the old code fell through and filed the course under
            # the PREVIOUS iteration's (stale, or undefined) start date;
            # skip unparseable dates instead.
            continue
        C['start'] = "%i-%i" % (startd.month, startd.day)
        C['end'] = "%i-%i" % (endd.month, endd.day)
        C['doy'] = startd.timetuple().tm_yday
        expanded.append(C)
        by_date.setdefault(startd, []).append(C)
    exoutfile.write(json.dumps(expanded, indent=2))
    exoutfile.close()
    put_file('/home/public/schedule/', 'cache/', expanded_out, 0)
    for X in sorted(by_date.keys()):
        # Very large buckets are the semester-wide start date, not late starts.
        if len(by_date[X]) < 200:
            prettydate = X.strftime("%A, %B %d")
            outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
            for Y in by_date[X]:
                outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
    outfile.close()
    return expanded
if __name__ == "__main__": if __name__ == "__main__":
print ('') print ('')
@ -1900,7 +2113,10 @@ if __name__ == "__main__":
7: ['Canvas data: automated sync', sync_non_interactive ], 7: ['Canvas data: automated sync', sync_non_interactive ],
8: ['Scrape schedule from ssb', scrape_schedule_multi ], 8: ['Scrape schedule from ssb', scrape_schedule_multi ],
9: ['Test ssb calls with python', scrape_schedule_py ], 9: ['Test ssb calls with python', scrape_schedule_py ],
10: ['Parse deanza schedule', dza_sched ], 10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
} }
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):