from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
from datetime import datetime as dt
import time, codecs, traceback
from bs4 import BeautifulSoup as bs
from io import StringIO
from time import strptime
from deepdiff import DeepDiff
from dateutil import parser
from util import fix_t_name, split_class_dept, split_class_code, split_class_code_letter
import json, re, sys, os, csv, pathlib
import schedules

def writepage(txt):
    # Save the most recently fetched page source for debugging failed scrapes.
    errfile = codecs.open('lastpage.txt', 'w', 'utf-8')
    errfile.write(txt)
    errfile.close()


DEBUG = 0

# Debug print helper, gated by the module-wide DEBUG flag.
def d(s, end=''):
    global DEBUG
    if end and DEBUG: print(s, end=end)
    elif DEBUG: print(s)


# Schedule / course-filling history.
# csv headers: crn, code, teacher, datetime, cap, act, wlcap, wlact
# Log the history of enrollments per course during registration.
def log_section_filling(current_sched_list, short_sem):
    # Each appended row: timestamp, crn, code, teacher, cap, act, wl_cap, wl_act
    rows_j = 'crn code teacher cap act wl_cap wl_act'.split(' ')
    now = datetime.now().strftime('%Y-%m-%dT%H-%M')
    csv_fn = 'cache/reg_history_' + short_sem + '.csv'
    with codecs.open(csv_fn, 'a', 'utf-8') as f:
        writer = csv.writer(f)
        for S in current_sched_list:
            items = [now]
            items.extend(S[X] for X in rows_j)
            writer.writerow(items)

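# What cache/reg_history_<sem>.csv accumulates -- one row per section per run
# (illustrative values only):
#
#   2025-01-02T08-00,40123,CSIS 10,Smith. J,35,12,10,0
#   2025-01-02T08-00,40124,MATH 1A,Jones. A,30,28,5,2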

# Same as above, but compressed: one 'act' (actual enrollment) column per run.
def log_section_filling2(current_sched_list, short_sem):
    now = datetime.now().strftime('%Y-%m-%dT%H')

    todays_data = { int(S['crn']): S['act'] for S in current_sched_list }

    todays_df = pd.DataFrame.from_dict(todays_data, orient='index', columns=[now])
    todays_df = todays_df.rename_axis('crn')
    todays_df.to_csv('cache/reg_today_new.csv', index=True)

    try:
        myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')
        print(myframe)
    except FileNotFoundError:
        print("Creating new data file for this semester.")
        fff = open('cache/reg_data_' + short_sem + '.csv', 'w')
        fff.write('crn\n')
        fff.close()
        myframe = pd.read_csv('cache/reg_data_' + short_sem + '.csv')

    new_df = myframe.join(todays_df, on='crn', how='outer')
    new_df = new_df.rename_axis('crn')
    print(new_df)

    reg_data_filename = 'reg_data_' + short_sem + '.csv'
    new_df.to_csv('cache/' + reg_data_filename, index=False)
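
# Sketch of the wide-format cache/reg_data_<sem>.csv built up above, gaining
# one timestamp column per run (illustrative values only):
#
#   crn,2025-01-02T08,2025-01-03T08
#   40123,10,12
#   40124,28,30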


# Take Banner's html and flatten the schedule table into CSV text.
def ssb_to_csv(src):
    output = 'crn,code,sec,cmp,cred,name,days,time,cap,act,rem,wl_cap,wl_act,wl_rem,teacher,date,loc,ztc,note\n'
    b = bs(src, 'html.parser')
    tab = b.find(class_="datadisplaytable")
    if not tab:
        print("hmm... didn't find a 'datadisplaytable' in this html")
        #print(src)
        return 0
    rows = tab.find_all('tr')
    drows = list(filter(row_has_data, rows))
    for dd in drows:
        t = row_text(dd)
        output += t
    return output
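
# Roughly what one flattened row looks like, following the header string above
# (illustrative values only; real rows come straight from Banner's table):
#
#   40123,CSIS 10,1,G,3.000,Intro to Computing,MW,11:20 am-12:10 pm,35,12,23,10,0,10,Smith. J,01/27-05/23,LI 120,1,
#
# "Note:" rows become a line of empty columns with the note quoted at the end.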


# Take CSV text lines and condense them into one dict per section.
def to_section_list(input_text, verbose=0):
    this_course = ''
    all_courses = []

    try:
        f = StringIO(input_text)
    except Exception:
        print("ERROR with this input_text:")
        print(input_text)
        return all_courses
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
    for row in reader:
        d = dict(zip(headers, row))
        # clean funny unicode char (non-breaking space) in blank entries
        r = {k: clean_funny2(v) for k, v in d.items()}
        if verbose: print("Cleaned: " + str(r))

        if 'time' in r:
            if r['time'] == 'TBA': r['time'] = ''
            if r['time']: r['partofday'] = time_to_partofday(r['time'])

        r['type'] = ''

        if 'loc' in r:
            if r['loc'] == 'ONLINE': r['type'] = 'online'
            if r['loc'] == 'ONLINE' and r['time']: r['type'] = 'online live'
            if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
            if r['loc']: r['site'] = room_to_site(r['loc'], verbose)

        if 'code' in r:
            if re.search(r'ONLINE\sLIVE', r['code']):
                r['type'] = 'online live'
            elif re.search(r'ONLINE', r['code']):
                r['type'] = 'online'

        # A row with a CRN starts a new section; a blank CRN continues the last one.
        if r['crn']:
            if verbose: print(" it's a new section.")
            if this_course:
                if not this_course['extra']: this_course.pop('extra', None)
                all_courses.append(this_course)
            this_course = r
            this_course['extra'] = []
        else:
            # continuation line: keep only its non-empty fields
            if verbose: print(" additional meeting: " + str(r))
            for k, v in list(r.items()):
                if not v: r.pop(k, None)
            # TODO: if extra line is different type?
            #if this_course['type']=='online' and r['type'] != 'online': this_course['type'] = 'hybrid'
            #elif this_course['type']!='online' and r['type'] == 'online': this_course['type'] = 'hybrid'
            this_course['extra'].append(r)

    # Flush the final section; no following CRN row will trigger its append.
    if this_course:
        if not this_course['extra']: this_course.pop('extra', None)
        all_courses.append(this_course)
    return all_courses
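
# Shape of one section dict produced above (illustrative values only; 'extra'
# holds continuation rows for any additional meetings):
#
#   { "crn": "40123", "code": "CSIS 10", "sec": "1", "name": "Intro to Computing",
#     "days": "MW", "time": "11:20 am-12:10 pm", "loc": "LI 120", "site": "Gilroy",
#     "partofday": "Midday", "type": "", "extra": [] }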


##
## SCHEDULE PARSE HELPERS
##

# Bucket a meeting time like '11:20 am-12:10 pm' into a part of day,
# judged by when the meeting ends.
def time_to_partofday(t):
    #todo: account for multiple sites/rows
    mor = strptime('12:00 PM', '%I:%M %p')
    mid = strptime('2:00 PM', '%I:%M %p')
    aft = strptime('6:00 PM', '%I:%M %p')
    if t == 'TBA':
        return 'TBA'
    t = t.upper()
    parts = t.split('-')
    try:
        begin = strptime(parts[0], '%I:%M %p')
        end = strptime(parts[1], '%I:%M %p')
        if end > aft:
            return "Evening"
        if end > mid:
            return "Afternoon"
        if end > mor:
            return "Midday"
        return "Morning"
    except Exception:
        # unparseable time string; leave the field blank
        return ""
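
# Quick sanity examples (buckets are decided by the end time):
#
#   time_to_partofday('8:10 am-9:00 am')    -> 'Morning'
#   time_to_partofday('11:20 am-12:10 pm')  -> 'Midday'
#   time_to_partofday('1:00 pm-2:50 pm')    -> 'Afternoon'
#   time_to_partofday('6:00 pm-8:50 pm')    -> 'Evening'
#   time_to_partofday('TBA')                -> 'TBA'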

# Deduce a 'site' field from the room name and known offsite locations.
def room_to_site(room, verbose=0):
    #todo: account for multiple sites/rows
    #todo: better way to store these offsite labels
    othersites = 'AV,SBHS I-243,SBHS I-244,LOADCS,HOPEH,HOPEG,PLY,SAS,SBHS,LOHS,CHS,SBRAT,'.split(',')
    # is it gilroy, mh, hol, other, online or hybrid?
    site = 'Gilroy'
    # Later, more specific checks deliberately override earlier ones.
    if room in othersites:
        site = "Other"
    if room == 'TBA':
        site = 'TBA'
    if room == 'AV':
        site = 'San Martin Airport'
    if re.search('MHG', room):
        site = 'Morgan Hill'
    if re.search('HOL', room):
        site = 'Hollister'
    if re.search('COY', room):
        site = 'Coyote Valley'
    if re.search('OFFSTE', room):
        site = 'Other'
    if re.search('ONLINE', room):
        site = 'Online'
    if verbose: print(room, '\t', end=' ')
    return site
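
# Examples of the deduction (note 'AV' matches the offsite list first, then the
# more specific San Martin Airport check wins):
#
#   room_to_site('LI 120')   -> 'Gilroy'    (the default)
#   room_to_site('MHG 121')  -> 'Morgan Hill'
#   room_to_site('HOL 305')  -> 'Hollister'
#   room_to_site('AV')       -> 'San Martin Airport'
#   room_to_site('ONLINE')   -> 'Online'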


def row_has_data(r):  # helper: keep rows that carry section data or a Note
    if r.find_all('th'):
        return False
    if len(r.find_all('td')) > 2:
        return True
    if re.search(r'Note:', r.get_text()):
        return True
    return False

def row_text(r):  # helper: flatten one <tr> into a CSV line
    d("Row Txt Fxn gets: ")
    arr = []
    for t in r.find_all('td'):
        # a cell whose first child is an <img> gets recorded as "1"
        if t.contents and len(t.contents) and t.contents[0].name == 'img':
            arr.append("1")
            d("img")
        r_text = t.get_text()
        arr.append(r_text)
        # a colspan=2 cell stands in for two columns; pad with a blank
        if 'colspan' in t.attrs and t['colspan'] == '2':
            d('[colspan2]')
            arr.append('')
        d("\t" + r_text, end=" ")
    d('')

    # a one-cell row is a "Note:" row; emit it in the trailing note column
    if len(arr) == 1 and re.search(r'Note:', arr[0]):
        note_line = clean_funny(arr[0])
        note_line = re.sub(r'\n', ' ', note_line)
        note_line = re.sub(r'"', '', note_line)
        return ',,,,,,,,,,,,,,,,,,"' + note_line + '"\n'
    del arr[0]
    arr[1] = clean_funny(arr[1])
    arr[2] = clean_funny(arr[2])
    # merge the subject and number cells into the single 'code' column
    if arr[1]: arr[1] = arr[1] + " " + arr[2]
    del arr[2]
    arr = [re.sub('\xa0', '', a) for a in arr]    # strip non-breaking spaces
    arr = [re.sub(',', '. ', a) for a in arr]     # commas would break the CSV
    arr = [re.sub(r'\(P\)', '', a) for a in arr]  # drop the (P)rimary marker
    arr = [a.strip() for a in arr]
    r = ','.join(arr) + '\n'
    r = re.sub('\n', '', r)
    r = re.sub('add to worksheet', '', r)
    d("Row Txt Fxn returns: " + r + "\n\n")

    return r + '\n'

# Blank Banner cells hold a lone non-breaking space; normalize those to ''.
def clean_funny(s):
    if s and s == '\xa0': return ''
    return s

def clean_funny2(s):
    if s and s == '\xa0': return ''
    if s and s == ' ': return ''
    return s

def clean_funny3(s):
    return re.sub('\xa0', '', s)

def scrape_schedule(short_sem, semester_label):
    # Set up Chrome options
    chrome_options = Options()
    #chrome_options.add_argument("--headless")  # Run headless
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Start WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    URL = "https://ssb-prod.ec.gavilan.edu/PROD/twbkwbis.P_GenMenu?name=bmenu.P_MainMnu"

    # TODO: move these credentials out of the source (env var or config file).
    GOO = "G00102586"
    GOO_PIN = "987654bb"

    filename = f"{short_sem}_sched.json"
    filename_html = f"{short_sem}_sched.html"

    try:
        # Open page
        driver.get(URL)
        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.ID, "UserID").clear()
        driver.find_element(By.ID, "UserID").send_keys(GOO)
        driver.find_element(By.NAME, "PIN").send_keys(GOO_PIN)
        driver.find_element(By.NAME, "loginform").submit()
        print('login')
        # Note: implicitly_wait sets the element-lookup timeout; it is not a sleep.
        driver.implicitly_wait(5)

        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.LINK_TEXT, "Student").click()
        print('students')
        driver.implicitly_wait(5)

        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.LINK_TEXT, "Registration").click()
        print('registration')
        driver.implicitly_wait(5)

        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.LINK_TEXT, "Search for Classes").click()
        print('search for classes')
        driver.implicitly_wait(15)

        writepage(driver.page_source)
        print(driver.title)

        dd = Select(driver.find_element(By.NAME, "p_term"))
        if dd:
            dd.select_by_visible_text(semester_label)
        driver.find_element(By.XPATH, "/html/body/div/div[4]/form").submit()
        print('semester')
        driver.implicitly_wait(15)

        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.XPATH, "/html/body/div/div[4]/form/input[18]").click()
        print('advanced?')
        driver.implicitly_wait(10)

        writepage(driver.page_source)
        print(driver.title)

        driver.find_element(By.NAME, "SUB_BTN").click()
        print('submit search')
        driver.implicitly_wait(40)
        time.sleep(15)  # give the results page time to render fully

        writepage(driver.page_source)
        print(driver.title)
        text = driver.page_source

        codecs.open('cache/' + filename_html, 'w', 'utf-8').write(text)

        ##
        ## Start parsing html
        ##

        as_list = ssb_to_csv(text)
        print(as_list)
        as_dict = to_section_list(as_list)
        jj = json.dumps(as_dict, indent=2)

        ##
        ## Diff from previous semester
        ##
        try:
            ps = codecs.open('cache/' + filename, 'r', 'utf-8')
            prev_sched = json.loads(ps.read())
            ps.close()

            if 1:  # sometimes I want to re-run this without affecting the logs.
                log_section_filling(as_dict, short_sem)
                log_section_filling2(as_dict, short_sem)

            diff = DeepDiff(prev_sched, as_dict, ignore_order=True)
            pretty_json = json.dumps(json.loads(diff.to_json()), indent=2)
            codecs.open('cache/%s_sched_diff.json' % short_sem, 'w', 'utf-8').write(pretty_json)

            # Next, rename the prev sched_xxYY.json data file to have its date,
            # make this new one, and then upload it to the website.
            # Maybe even count the entries and do a little sanity checking.

            last_mod = time.ctime(os.path.getmtime('cache/' + filename))

            prev_stat = pathlib.Path('cache/' + filename).stat()
            mtime = dt.fromtimestamp(prev_stat.st_mtime)
            print(mtime)
        except Exception as e:
            print("Couldn't Diff.")
            print("Got an exception: ", e)

        codecs.open(f'cache/{filename}', 'w', 'utf-8').write(jj)

        return as_dict

    except Exception as e:
        print("Got an exception: ", e)
        traceback.print_exc()

    finally:
        driver.quit()


# Expand each section with dept/dean info and write enrollment_<sem>.csv.
def expanded(as_dict, short_sem):
    course_to_gp, course_to_area, areacode_to_area, area_to_dean, course_to_dean, dean_code_to_name = schedules.campus_dept_hierarchy()

    sections = list_latestarts(short_sem)
    fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")

    ffcsv = codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8')
    with ffcsv as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)

        for S in sections:
            parts = S['code'].split(' ')
            S['dept'] = parts[0]
            S['num'] = parts[1]
            S['gp'] = course_to_gp[parts[0]]
            S['dean'] = course_to_dean[parts[0]]
            S['sem'] = short_sem
            # S['act'] = S['cap']
            if S['loc'] == "ONLINE LIVE": S['site'] = 'OnlineLive'
            csvwriter.writerow([S[x] for x in fields])

    #put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)
    return sections


# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
    show_summary = 1

    the_year = '20' + term[2:4]
    print("year: ", the_year, " semester: ", term)

    term_out = "cache/%s_latestarts.txt" % term
    expanded_out = "%s_sched_expanded.json" % term
    print("Writing output to " + term_out)
    outfile = codecs.open(term_out, "w", "utf-8")
    exoutfile = codecs.open('cache/' + expanded_out, "w", "utf-8")
    expanded = []

    #sched = requests.get(f"http://gavilan.cc/schedule/{term}_sched.json").json()
    sched = json.loads(codecs.open(f"cache/{term}_sched.json", "r", "utf-8").read())

    by_date = {}

    if show_summary: print("course \t loc \t type \t time")

    for C in sched:
        if (not C['type']) and C['loc'] != 'ONLINE':  # and C['time']:
            C['type'] = 'in-person'

        if show_summary: print("%s \t %s \t %s \t %s" % (C['code'], C['loc'], C['type'], C['time']))

        # a section whose first extra meeting is online counts as a hybrid
        if 'extra' in C:
            if 'partofday' in C and ('type' in C['extra'][0]) and (C['extra'][0]['type'] == 'online') and C['loc'] != "ONLINE LIVE":
                C['type'] = 'hybrid'

        times = C['time'].split("-")
        if len(times) > 1:
            time_start = times[0]
            time_end = times[1]

            try:
                startt = time.strptime(time_start, "%I:%M %p")
                endt = time.strptime(time_end, "%I:%M %p")
                min_start = startt.tm_min
                min_end = endt.tm_min
                if min_start == 0: min_start = "00"
                else: min_start = str(min_start)
                if min_end == 0: min_end = "00"
                else: min_end = str(min_end)
                C['time_start'] = "%i:%s" % (startt.tm_hour, min_start)
                C['time_end'] = "%i:%s" % (endt.tm_hour, min_end)
                if 0:
                    print("+ Parsed %s into %s and %s." % (C['time'], C['time_start'], C['time_end']))
            except Exception as e:
                print(e, "\n-- problem parsing time ", time_start, " or ", time_end)
        else:
            C['time_start'] = ''
            C['time_end'] = ''

        if re.search('TBA', C['date']):
            C['start'] = ''
            C['end'] = ''
            C['doy'] = ''
            expanded.append(C)
            continue

        parts = C['date'].split("-")
        start = parts[0] + "/" + the_year
        end = parts[1] + "/" + the_year

        try:
            startd = parser.parse(start)
            endd = parser.parse(end)
            C['start'] = "%i-%i" % (startd.month, startd.day)
            C['end'] = "%i-%i" % (endd.month, endd.day)
            C['doy'] = startd.timetuple().tm_yday
            expanded.append(C)
        except Exception as e:
            print(e, "\n-- problem parsing ", start, " or ", end)
            continue  # don't index this course under a stale or undefined start date
        if startd not in by_date:
            by_date[startd] = []
        by_date[startd].append(C)

    exoutfile.write(json.dumps(expanded, indent=2))
    exoutfile.close()
    #put_file('/home/public/schedule/', 'cache/', expanded_out, 0)

    for X in sorted(by_date.keys()):
        if len(by_date[X]) < 200:
            prettydate = X.strftime("%A, %B %d")
            outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
            for Y in by_date[X]:
                outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\n")
    outfile.close()
    #put_file('/home/public/schedule/', 'cache/', "%s_latestarts.txt" % term, 0)
    return expanded
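
# Format of the resulting <term>_latestarts.txt (illustrative entries only):
#
#   Monday, February 10: 2 courses
#       CSIS 10 40123   Smith. J    online
#       MATH 1A 40124   Jones. A    in-person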
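

# Expected shape of cache/to_scrape.json -- an assumption inferred from the
# keys used below; 'sem' must match the visible text of Banner's term dropdown:
#
#   [ {"short_sem": "sp25", "sem": "Spring 2025"},
#     {"short_sem": "fa25", "sem": "Fall 2025"} ]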
# Get semesters to scrape
with open('cache/to_scrape.json', 'r') as f:
    semesters = json.load(f)

# Loop through each semester: scrape it, then expand and publish its data.
for item in semesters:
    as_dict = scrape_schedule(item['short_sem'], item['sem'])
    ex = expanded(as_dict, item['short_sem'])
    print(f"Done with {item['sem']}. Sleeping 45 seconds.")
    time.sleep(45)