canvasapp/ssb.py

607 lines
20 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
import time, codecs, traceback
from bs4 import BeautifulSoup as bs
from io import StringIO
from time import strptime
from deepdiff import DeepDiff
from datetime import datetime as dt
from dateutil import parser
from util import fix_t_name, split_class_dept, split_class_code, split_class_code_letter
import json, re, sys, os, codecs, csv, pathlib
import schedules
def writepage(txt):
    """Overwrite ``lastpage.txt`` in the working directory with *txt* (UTF-8).

    scrape_schedule() calls this after each navigation step with the current
    page source, so the file always holds the most recently saved page.

    Args:
        txt: Text to write.
    """
    # The context manager closes the handle even if write() raises.
    with codecs.open('lastpage.txt', 'w', 'utf-8') as errfile:
        errfile.write(txt)
# Set to a truthy value to enable the d() debug prints below.
DEBUG = 0


def d(s, end=''):
    """Print *s* only while the module-level DEBUG flag is truthy.

    Args:
        s: Value to print.
        end: When non-empty it is passed to print() as ``end``; when empty
            print()'s default line ending is used.
    """
    if not DEBUG:
        return
    if end:
        print(s, end=end)
    else:
        print(s)
# Schedule / course filling history
# csv columns (no header row is written):
#   timestamp, crn, code, teacher, cap, act, wl_cap, wl_act
# Log the history of enrollments per course during registration
def log_section_filling(current_sched_list, short_sem):
    """Append one timestamped enrollment row per section to the history CSV.

    Rows are appended to ``cache/reg_history_<short_sem>.csv``. Every row of a
    single call shares the same ``YYYY-MM-DDTHH-MM`` timestamp.

    Args:
        current_sched_list: Iterable of section dicts; each must have the keys
            crn, code, teacher, cap, act, wl_cap and wl_act.
        short_sem: Semester tag used in the file name.

    Raises:
        KeyError: If a section lacks one of the logged keys.
    """
    columns = 'crn code teacher cap act wl_cap wl_act'.split(' ')
    print(columns)
    now = datetime.now().strftime('%Y-%m-%dT%H-%M')
    csv_fn = 'cache/reg_history_' + short_sem + '.csv'
    with codecs.open(csv_fn, 'a', 'utf-8') as f:
        writer = csv.writer(f)
        for S in current_sched_list:
            writer.writerow([now] + [S[col] for col in columns])
# Same as above, but compressed, act only
def log_section_filling2(current_sched_list, short_sem):
    """Record the current 'act' value of every section as a new hourly column.

    ``cache/reg_data_<short_sem>.csv`` holds one row per CRN and one column
    per ``YYYY-MM-DDTHH`` snapshot. This call adds (or overwrites) the column
    for the current hour and rewrites the file through a temporary file that
    is then moved over the original. The snapshot alone is also written to
    ``cache/reg_today_new.csv``.

    Args:
        current_sched_list: Iterable of section dicts with 'crn' and 'act'.
        short_sem: Semester tag used in the data file name.
    """
    now = datetime.now().strftime('%Y-%m-%dT%H')

    # Create the cache directory before anything is written into it.
    cache_dir = pathlib.Path('cache')
    cache_dir.mkdir(parents=True, exist_ok=True)
    csv_path = cache_dir / f'reg_data_{short_sem}.csv'

    todays_data = {int(S['crn']): S['act'] for S in current_sched_list}
    todays_df = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
    todays_df = todays_df.rename_axis('crn')
    todays_df.index = todays_df.index.astype(str)
    todays_df.to_csv('cache/reg_today_new.csv', index=True)

    try:
        # dtype=str keeps stored counts as text. Without it, any column that
        # contains blanks is read as float and written back as e.g. "12.0".
        myframe = pd.read_csv(csv_path, dtype=str)
        print(myframe)
    except FileNotFoundError:
        myframe = pd.DataFrame(columns=['crn'])
        print("Creating new data file for this semester.")
    except pd.errors.EmptyDataError:
        myframe = pd.DataFrame(columns=['crn'])
        print("Existing data file was empty; starting fresh for this semester.")

    if 'crn' in myframe.columns:
        myframe = myframe.set_index('crn')
    else:
        myframe = myframe.rename_axis('crn')
    myframe.index = myframe.index.astype(str)

    # Keep every CRN seen so far plus any that are new in this snapshot.
    combined_df = myframe.reindex(myframe.index.union(todays_df.index))
    combined_df[now] = todays_df[now]
    combined_df = combined_df.sort_index()
    combined_df = combined_df.reset_index()
    combined_df = combined_df.fillna('')
    print(combined_df)

    # Write to a sibling temp file first so a failed write cannot leave a
    # half-written data file behind.
    tmp_path = csv_path.with_suffix(csv_path.suffix + '.tmp')
    combined_df.to_csv(tmp_path, index=False)
    tmp_path.replace(csv_path)
# Take banner's html and make a csv(?) file
def ssb_to_csv(src):
    """Convert the schedule HTML into CSV text.

    Finds the first element with class ``datadisplaytable``, keeps the ``tr``
    rows accepted by row_has_data() and concatenates their row_text() output
    after a fixed header line.

    Args:
        src: HTML source of the results page.

    Returns:
        The CSV text (header line plus one line per kept row), or the integer
        0 when no ``datadisplaytable`` element is found.
    """
    header = 'crn,code,sec,cmp,cred,name,days,time,cap,act,rem,wl_cap,wl_act,wl_rem,teacher,date,loc,ztc,note\n'
    soup = bs(src, 'html.parser')
    table = soup.find(class_="datadisplaytable")
    if not table:
        print("hmm... didn't find a 'datadisplaytable' in this html: ")
        return 0
    data_rows = [tr for tr in table.find_all('tr') if row_has_data(tr)]
    return header + ''.join(row_text(tr) for tr in data_rows)
# take text lines and condense them to one dict per section
def to_section_list(input_text,verbose=0):
    """Parse schedule CSV text into a list with one dict per section.

    The first CSV line supplies the keys. A row with a non-empty 'crn' starts
    a new section. A row with an empty 'crn' is a continuation (extra meeting
    or note) and is appended, with its empty fields removed, to the 'extra'
    list of the section before it. Sections with no continuation rows have no
    'extra' key. Each row also gets 'type', and where applicable 'partofday'
    and 'site'.

    Args:
        input_text: CSV text whose first line is the header.
        verbose: When truthy, print progress details.

    Returns:
        List of section dicts, in input order, including the last section.
        An empty list when the text has no header line.

    Raises:
        TypeError: If input_text is not text (for example the 0 that
            ssb_to_csv() returns when it finds no table).
    """
    try:
        f = StringIO(input_text)
    except TypeError:
        print("ERROR with this input_text:")
        print(input_text)
        raise

    reader = csv.reader(f, delimiter=',')
    headers = next(reader, None)
    if headers is None:
        return []

    all_courses = []
    this_course = None

    def finish(course):
        # Drop the placeholder 'extra' list when no continuation rows arrived.
        if not course['extra']:
            course.pop('extra', None)
        all_courses.append(course)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to record.
            continue
        # clean funny unicode char in blank entries
        r = {k: clean_funny2(v) for k, v in zip(headers, row)}
        if verbose: print("Cleaned: " + str(r))

        if 'time' in r:
            if r['time'] == 'TBA': r['time'] = ''
            if r['time']: r['partofday'] = time_to_partofday(r['time'])

        r['type'] = ''
        if 'loc' in r:
            if r['loc'] == 'ONLINE': r['type'] = 'online'
            if r['loc'] == 'ONLINE' and r.get('time'): r['type'] = 'online live'
            if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
            if r['loc']: r['site'] = room_to_site(r['loc'], verbose)

        if 'code' in r:
            if re.search(r'ONLINE\sLIVE', r['code']):
                r['type'] = 'online live'
            elif re.search(r'ONLINE', r['code']):
                r['type'] = 'online'

        if r.get('crn'):
            # A new section begins; the one being built is now complete.
            if verbose: print(" it's a new section.")
            if this_course:
                finish(this_course)
            this_course = r
            this_course['extra'] = []
        else:
            # is a continuation line
            if verbose: print(" additional meeting: " + str(r))
            if this_course is None:
                print("Skipping continuation row with no preceding section: " + str(r))
                continue
            r = {k: v for k, v in r.items() if v}
            # TODO: if extra line is different type?
            this_course['extra'].append(r)

    # The loop only stores a section when the next one starts, so the final
    # section has to be stored here.
    if this_course:
        finish(this_course)
    return all_courses
##
## SCHEDULE PARSE HELPERS
##
##
def time_to_partofday(t):
    """Classify a meeting-time range by the time at which it ends.

    Args:
        t: A range such as ``'11:20 am-12:10 pm'`` or the literal ``'TBA'``.
            Case is ignored and spaces around the dash are tolerated.

    Returns:
        ``'TBA'`` for ``'TBA'``. Otherwise ``'Evening'`` when the end is after
        6:00 PM, ``'Afternoon'`` when after 2:00 PM, ``'Midday'`` when after
        12:00 PM, else ``'Morning'``. An empty string when either time cannot
        be parsed.
    """
    #todo: account for multiple sites/rows
    if t == 'TBA':
        return 'TBA'
    parts = t.upper().split('-')
    try:
        # The start time is parsed only to validate it; the label depends
        # solely on the end time.
        strptime(parts[0].strip(), '%I:%M %p')
        end = strptime(parts[1].strip(), '%I:%M %p')
    except (ValueError, IndexError):
        return ""
    end_hm = (end.tm_hour, end.tm_min)
    if end_hm > (18, 0):
        return "Evening"
    if end_hm > (14, 0):
        return "Afternoon"
    if end_hm > (12, 0):
        return "Midday"
    return "Morning"
# Deduce a 'site' field, based on room name and known offsite locations
def room_to_site(room,verbose=0):
    """Map a room label to a site name.

    Rules are applied in a fixed order and the last one that matches wins:
    the default 'Gilroy', then the exact off-site room list, then the exact
    labels 'TBA' and 'AV', then substring matches for MHG, HOL, COY, OFFSTE
    and ONLINE.

    Args:
        room: Room/location label.
        verbose: When truthy, print the room followed by a tab.

    Returns:
        The site name.
    """
    #todo: account for multiple sites/rows
    #todo: better way to store these offsite labels
    offsite_rooms = 'AV,SBHS I-243,SBHS I-244,LOADCS,HOPEH,HOPEG,PLY,SAS,SBHS,LOHS,CHS,SBRAT,'.split(',')
    exact_labels = (
        ('TBA', 'TBA'),
        ('AV', 'San Martin Airport'),
    )
    substring_labels = (
        ('MHG', 'Morgan Hill'),
        ('HOL', 'Hollister'),
        ('COY', 'Coyote Valley'),
        ('OFFSTE', 'Other'),
        ('ONLINE', 'Online'),
    )

    # is it gilroy, mh, hol, other, online or hybrid?
    site = "Other" if room in offsite_rooms else 'Gilroy'
    for label, name in exact_labels:
        if room == label:
            site = name
    for pattern, name in substring_labels:
        if re.search(pattern, room):
            site = name

    if verbose: print(room, '\t', end=' ')
    return site
def row_has_data(r): # helper
    """Decide whether a table row should be converted to a CSV line.

    Args:
        r: A row element offering ``find_all(tag)`` and ``get_text()``.

    Returns:
        False for rows containing any ``th`` cell. True for rows with more
        than two ``td`` cells, or whose text contains ``Note:``. False
        otherwise.
    """
    if r.find_all('th'):
        return False
    if len(r.find_all('td')) > 2:
        return True
    # Raw string; ':' needs no escaping (the old '\:' was an invalid escape).
    return bool(re.search(r'Note:', r.get_text()))
def row_text(r): # helper
    """Turn one table row into a single line of CSV text.

    Each ``td`` contributes its text. A cell whose first child is an ``img``
    contributes an extra "1" before its text, and a cell with ``colspan="2"``
    contributes an extra empty field after it.

    A row that yields exactly one field containing ``Note:`` becomes a line of
    eighteen empty fields followed by the quoted note. For any other row the
    first field is dropped, the next two fields are joined with a space when
    the first of them is non-empty, non-breaking spaces and ``(P)`` markers
    are removed, commas inside fields become ``". "``, and the fields are
    joined with commas.

    Args:
        r: A row element offering ``find_all('td')``.

    Returns:
        The CSV line, terminated by a newline.
    """
    d("Row Txt Fxn gets: ")
    arr = []
    for t in r.find_all('td'):
        if t.contents and t.contents[0].name == 'img':
            arr.append("1")
            d("img")
        r_text = t.get_text()
        arr.append(r_text)
        if 'colspan' in t.attrs and t['colspan'] == '2':
            d('[colspan2]')
            arr.append('')
        d("\t" + r_text, end=" ")
    d('')

    if len(arr) == 1 and re.search(r'Note:', arr[0]):
        note_line = clean_funny(arr[0])
        # Newlines and double quotes would break the one-line quoted field.
        note_line = note_line.replace('\n', ' ').replace('"', '')
        return ',,,,,,,,,,,,,,,,,,"' + note_line + '"\n'

    del arr[0]
    arr[1] = clean_funny(arr[1])
    arr[2] = clean_funny(arr[2])
    if arr[1]: arr[1] = arr[1] + " " + arr[2]
    del arr[2]

    # The non-breaking space is written as an explicit escape so it cannot be
    # mistaken for an ordinary space (ordinary spaces must be preserved).
    arr = [a.replace('\xa0', '') for a in arr]
    arr = [a.replace(',', '. ') for a in arr]
    arr = [a.replace('(P)', '') for a in arr]
    arr = [a.strip() for a in arr]

    line = ','.join(arr).replace('\n', '')
    line = line.replace('add to worksheet', '')
    d("Row Txt Fxn returns: " + line + "\n\n")
    return line + '\n'
def clean_funny(str):
    """Return '' when *str* is a lone blank placeholder, else *str* unchanged.

    A lone non-breaking space (U+00A0) or a lone ordinary space counts as a
    blank placeholder. The previous check compared ``str.encode('utf8')``
    (bytes) with a text literal, which is never equal in Python 3, so nothing
    was ever cleaned.

    Args:
        str: Cell text (may be empty or None).

    Returns:
        '' for a blank placeholder; otherwise the argument as given.
    """
    if str in ('\xa0', ' '):
        return ''
    return str
def clean_funny2(str):
    """Return '' when *str* is a lone blank placeholder, else *str* unchanged.

    The second comparison in the earlier version used a literal that renders
    as a blank and cannot be told apart by eye. Both candidates are spelled
    out here: a lone non-breaking space (U+00A0) and a lone ordinary space.

    Args:
        str: Field text (may be empty or None).

    Returns:
        '' for a blank placeholder; otherwise the argument as given.
    """
    if str in ('\xa0', ' '):
        return ''
    return str
def clean_funny3(str):
    """Return *str* with every non-breaking space (U+00A0) removed."""
    nbsp = '\xa0'
    return re.sub(nbsp, '', str)
def scrape_schedule(short_sem, semester_label):
    """Scrape one semester's class schedule from Banner SSB and save it.

    Drives Chrome through login, Student, Registration, Search for Classes,
    term selection, advanced search and submit. The results page is saved to
    ``cache/<short_sem>_sched.html`` and parsed with ssb_to_csv() and
    to_section_list(). The result is diffed against the previously saved
    ``cache/<short_sem>_sched.json`` (diff written to
    ``cache/<short_sem>_sched_diff.json``), enrollment history is logged, and
    the JSON file is overwritten with the new data.

    Args:
        short_sem: Semester tag used in cache file names.
        semester_label: Visible text of the term to pick in the term dropdown.

    Returns:
        The list of section dicts on success. The integer 1 when navigation
        or parsing raised an exception. The integer 2 when the previous
        schedule could not be loaded or diffed; in that case the new JSON is
        not written.
    """
    # Set up Chrome options
    chrome_options = Options()
    #chrome_options.add_argument("--headless") # Run headless
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Start WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    URL = "https://ssb-prod.ec.gavilan.edu/PROD/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"
    # NOTE(review): login credentials are hard-coded in source. They should be
    # moved to configuration or the environment and rotated.
    GOO = "G00102586"
    GOO_PIN = "987654bb"

    filename = f"{short_sem}_sched.json"
    filename_html = f"{short_sem}_sched.html"

    def checkpoint(label, wait):
        # Announce the step, set the implicit wait, snapshot the page, show its title.
        print(label)
        driver.implicitly_wait(wait)
        writepage(driver.page_source)
        print(driver.title)

    try:
        # Open page
        driver.get(URL)
        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.ID,"UserID").clear()
        driver.find_element(By.ID,"UserID").send_keys(GOO)
        driver.find_element(By.NAME,"PIN").send_keys(GOO_PIN)
        driver.find_element(By.NAME,"loginform").submit()
        checkpoint('login', 5)

        driver.find_element(By.LINK_TEXT,"Student").click()
        checkpoint('students', 5)

        # switch to new tab
        driver.switch_to.window(driver.window_handles[1])
        driver.find_element(By.LINK_TEXT,"Registration").click()
        checkpoint('registration', 5)

        driver.find_element(By.LINK_TEXT,"Search for Classes").click()
        checkpoint('search for classes', 15)

        term_select = Select(driver.find_element(By.NAME,"p_term"))
        term_select.select_by_visible_text(semester_label)
        driver.find_element(By.XPATH,"/html/body/div/div[4]/form").submit()
        checkpoint('semester', 15)

        driver.find_element(By.XPATH,"/html/body/div/div[4]/form/input[18]").click()
        checkpoint('advanced?', 10)

        driver.find_element(By.NAME,"SUB_BTN").click()
        print('login')
        driver.implicitly_wait(40)
        # Fixed pause before the results page is captured.
        time.sleep(15)
        driver.implicitly_wait(40)
        writepage(driver.page_source)
        print(driver.title)

        text = driver.page_source
        with codecs.open('cache/' + filename_html,'w', 'utf-8') as html_out:
            html_out.write(text)

        ##
        ## Start parsing html
        ##
        as_list = ssb_to_csv(text)
        print(as_list)
        as_dict = to_section_list(as_list)
        jj = json.dumps(as_dict,indent=2)

        ##
        ## Diff from the previously saved schedule for this semester
        ##
        try:
            with codecs.open('cache/'+filename,'r','utf-8') as ps:
                prev_sched = json.loads(ps.read())

            if 1: # sometimes I want to re-run this without affecting the logs.
                log_section_filling(as_dict, short_sem)
                log_section_filling2(as_dict, short_sem)

            diff = DeepDiff(prev_sched, as_dict, ignore_order=True)
            pretty_json = json.dumps( json.loads( diff.to_json() ), indent=2 )
            with codecs.open('cache/%s_sched_diff.json' % short_sem,'w','utf-8') as diff_out:
                diff_out.write(pretty_json)

            # Next, rename the prev sched_xxYY.json data file to have its date,
            # make this new one, and then upload it to the website.
            # Maybe even count the entries and do a little sanity checking
            prev_stat = pathlib.Path('cache/' + filename).stat()
            mtime = dt.fromtimestamp(prev_stat.st_mtime)
            print(mtime)
        except Exception as e:
            # NOTE(review): this path is also taken when no previous schedule
            # file exists yet, so a first run for a new semester never writes
            # the JSON file. Confirm whether that is intended.
            print("Couldn't Diff.")
            print("Got an exception: ", e)
            return 2

        with codecs.open(f'cache/{filename}', 'w', 'utf-8') as json_out:
            json_out.write(jj)
        return as_dict

    except Exception as e:
        print("Got an exception: ", e)
        return 1

    finally:
        # Always shut the browser down, whichever path returned.
        driver.quit()
def expanded(short_sem):
    """Write ``cache/enrollment_<short_sem>.csv`` from the expanded schedule.

    Obtains the sections from list_latestarts() and adds 'dept' and 'num'
    (split from 'code'), 'gp' and 'dean' (looked up by department), and 'sem'.
    Sets 'site' to 'OnlineLive' when 'loc' is "ONLINE LIVE". Writes a header
    line and then one CSV row per section.

    Args:
        short_sem: Semester tag used for the input and output file names.

    Returns:
        None.
    """
    (course_to_gp, _course_to_area, _areacode_to_area,
     _area_to_dean, course_to_dean, _dean_code_to_name) = schedules.campus_dept_hierarchy()

    sections = list_latestarts(short_sem)
    fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")

    with codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(fields)
        for section in sections:
            code_parts = section['code'].split(' ')
            dept = code_parts[0]
            section['dept'] = dept
            section['num'] = code_parts[1]
            section['gp'] = course_to_gp[dept]
            section['dean'] = course_to_dean[dept]
            section['sem'] = short_sem
            if section['loc'] == "ONLINE LIVE":
                section['site'] = 'OnlineLive'
            row = [section[name] for name in fields]
            out.writerow(row)
    #put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
    """Expand a saved schedule with parsed times and dates, grouped by start date.

    Reads ``cache/<term>_sched.json``. For every section it fills in a missing
    'type', adds 'time_start' / 'time_end' (24-hour ``H:MM``, or '' when the
    time is absent or unparsable) and adds 'start', 'end' (``M-D``) and 'doy'
    (day of year), which are '' for TBA dates. The expanded list is written to
    ``cache/<term>_sched_expanded.json``. A listing grouped by start date is
    written to ``cache/<term>_latestarts.txt``, omitting dates on which 200 or
    more sections start.

    Sections whose date cannot be parsed are reported and left out of both
    outputs.

    Args:
        term: Semester tag; characters 2-3 are taken as the two-digit year.

    Returns:
        The list of expanded section dicts.
    """
    show_summary = 1
    the_year = '20' + term[2:4]
    print("year: ", the_year, " semester: ", term)

    term_out = "cache/%s_latestarts.txt" % term
    expanded_out = "%s_sched_expanded.json" % term
    print("Writing output to " + term_out)

    # Read the input before opening any output, so a missing schedule file
    # does not truncate earlier results.
    with codecs.open(f"cache/{term}_sched.json","r","utf-8") as infile:
        sched = json.loads(infile.read())

    expanded = []
    by_date = {}

    if show_summary: print("course \t loc \t type \t time")

    for C in sched:
        if (not C['type']) and C['loc'] != 'ONLINE': # and C['time']:
            C['type'] = 'in-person'

        if show_summary: print("%s \t %s \t %s \t %s" % (C['code'],C['loc'],C['type'],C['time']))

        extra = C.get('extra')
        if extra and 'partofday' in C and extra[0].get('type') == 'online' and C['loc'] != "ONLINE LIVE":
            C['type'] = 'hybrid'

        # Default to blank so both keys exist even when parsing fails.
        C['time_start'] = ''
        C['time_end'] = ''
        times = C['time'].split("-")
        if len(times) > 1:
            time_start = times[0]
            time_end = times[1]
            try:
                startt = time.strptime(time_start.strip(),"%I:%M %p")
                endt = time.strptime(time_end.strip(),"%I:%M %p")
                # %02i zero-pads every minute value, not only :00.
                C['time_start'] = "%i:%02i" % (startt.tm_hour, startt.tm_min)
                C['time_end'] = "%i:%02i" % (endt.tm_hour, endt.tm_min)
            except ValueError as e:
                print(e, "\n-- problem parsing time ", time_start, " or ", time_end)

        if re.search('TBA',C['date']):
            C['start'] = ''
            C['end'] = ''
            C['doy'] = ''
            expanded.append(C)
            continue

        parts = C['date'].split("-")
        if len(parts) < 2:
            print("\n-- problem parsing ", C['date'])
            continue
        start = parts[0] + "/" + the_year
        end = parts[1] + "/" + the_year

        try:
            startd = parser.parse(start)
            endd = parser.parse(end)
        except (ValueError, OverflowError) as e:
            # Skip the section; it must not be filed under another
            # section's start date.
            print(e, "\n-- problem parsing ", start, " or ", end)
            continue

        C['start'] = "%i-%i" % (startd.month,startd.day)
        C['end'] = "%i-%i" % (endd.month,endd.day)
        C['doy'] = startd.timetuple().tm_yday
        expanded.append(C)
        by_date.setdefault(startd, []).append(C)

    with codecs.open('cache/' + expanded_out, "w", "utf-8") as exoutfile:
        exoutfile.write( json.dumps(expanded,indent=2) )
    #put_file('/home/public/schedule/', 'cache/', expanded_out, 0)

    with codecs.open(term_out, "w", "utf-8") as outfile:
        for X in sorted(by_date):
            starting = by_date[X]
            if len(starting) < 200:
                prettydate = X.strftime("%A, %B %d")
                outfile.write(prettydate + ": " + str(len(starting)) + " courses" + "\n")
                for Y in starting:
                    outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
    #put_file('/home/public/schedule/', 'cache/', "%s_latestarts.txt" % term, 0)
    return expanded
# Get semesters to scrape
with open('cache/to_scrape.json', 'r') as f:
    semesters = json.load(f)

# Loop through each item and call the function
for item in semesters:
    result = scrape_schedule(item['short_sem'], item['sem'])
    # scrape_schedule() returns the section list on success and a non-zero
    # integer code on failure. The old test (result == 0) was never true on
    # success, so expanded() was never run.
    if isinstance(result, int) and result != 0:
        print(f"Stopped due to error: {result}")
    else:
        expanded(item['short_sem'])
        print(f"Done with {item['sem']}. Sleeping 45 seconds.")
        time.sleep(45)