cleaning - courses, pipelines

This commit is contained in:
phowell 2023-03-27 17:17:32 -07:00
parent 89165b6a09
commit 55290f9fa1
5 changed files with 74 additions and 71 deletions

View File

@ -2,7 +2,7 @@ import json, re, requests, codecs, sys, time, funcy, os
import pandas as pd
from dateutil import parser
from datetime import datetime
from util import print_table
from util import print_table, int_or_zero, float_or_zero
from pipelines import fetch, fetch_stream, getSemesterSchedule, fetch_collapse, header, url, shortToLongSem
from pipelines import sems
@ -19,14 +19,6 @@ stem_course_id = '11015' # TODO
#########
#########
def int_or_zero(x):
    """Convert x to an int, treating None as 0.

    Args:
        x: None, or any value accepted by int().

    Returns:
        0 when x is None, otherwise int(x).

    Raises:
        ValueError, TypeError: propagated from int() when x cannot be
            converted.
    """
    # Identity check: `x == None` defers to x.__eq__, which an arbitrary
    # object may override, so a non-None value could be mistaken for None.
    if x is None:
        return 0
    return int(x)
def float_or_zero(x):
    """Convert x to a float, treating None as 0.

    Args:
        x: None, or any value accepted by float().

    Returns:
        The integer 0 when x is None (kept as an int for backward
        compatibility), otherwise float(x).

    Raises:
        ValueError, TypeError: propagated from float() when x cannot be
            converted.
    """
    # Identity check: `x == None` defers to x.__eq__, which an arbitrary
    # object may override, so a non-None value could be mistaken for None.
    if x is None:
        return 0
    return float(x)
# Gott 1 Bootcamp - report on who completed it.
def get_gott1_passers():
course = '1561'
@ -224,67 +216,6 @@ def change_course_ann_homepage(id="10458"):
print(r.text)
#########
######### BOOKSTORE
#########
#########
def scrape_bookstore():
    """Load the cached bookstore course list (work-in-progress stub).

    Reads and parses cache/bookstore_courses.json. The URL locals below are
    not requested yet; they record the bookstore endpoints alongside notes on
    where the material lists sit in the responses. Nothing is returned.

    Raises:
        OSError: if the cache file cannot be opened.
        json.JSONDecodeError: if the cache file is not valid JSON.
    """
    big_courselist_url = "https://svc.bkstr.com/courseMaterial/courses?storeId=10190&termId=100058761"

    # Context manager so the cache file handle is closed once it is parsed.
    with open('cache/bookstore_courses.json', 'r') as cache_file:
        bcu_cached = json.load(cache_file)

    one_section = "https://svc.bkstr.com/courseMaterial/results?storeId=10190&langId=-1&catalogId=11077&requestType=DDCSBrowse" # NO TEXT

    another_section = "https://svc.bkstr.com/courseMaterial/results?storeId=10190&langId=-1&catalogId=11077&requestType=DDCSBrowse" # 3 REQUIRED at:
    # [""0""].courseSectionDTO[""0""].courseMaterialResultsList
    #
    # and also:
    #
    # [""0""].courseSectionDTO[""0""].sectionAdoptionDTO.materialAdoptions
# todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term="sp23"):
    """Report course sections grouped by their start date.

    Reads cache/<term>_sched.json (a list of section records), groups the
    sections by the start date parsed from each record's 'date' field, and
    writes the result to cache/<term>_latestarts.txt while echoing it to
    stdout.

    Args:
        term: short semester code used to build both cache file names.
            Defaults to "sp23", the value that used to be hard-coded.

    Raises:
        OSError: if either cache file cannot be opened.
        KeyError: if a record lacks one of the 'date', 'code', 'crn',
            'teacher' or 'type' keys that are read here.
    """
    term_in = "cache/" + term + "_sched.json"
    term_out = "cache/" + term + "_latestarts.txt"
    print("Writing output to " + term_out)

    # Context manager so the schedule file is closed as soon as it is parsed.
    with open(term_in, "r") as infile:
        sched = json.load(infile)

    by_date = {}
    for C in sched:
        # 'date' is split on "-" and only the first piece (the start) is used.
        start = C['date'].split("-")[0]
        dept = C['code'].split(' ')[0]
        if dept in ['JLE','JFT','CWE']:
            continue
        if 'TBA' in start:
            continue
        try:
            startd = parser.parse(start)
        except (ValueError, OverflowError) as e:
            print(e, "\nproblem parsing ", start)
            # Skip this section: falling through would reuse the previous
            # section's date, or raise NameError on the first record.
            continue
        by_date.setdefault(startd, []).append(C)

    with open(term_out, "w") as outfile:
        for X in sorted(by_date):
            sections = by_date[X]
            # Dates on which 200 or more sections start are left out
            # (presumably the regular semester start - confirm).
            if len(sections) >= 200:
                continue
            prettydate = X.strftime("%A, %B %d")
            summary = prettydate + ": " + str(len(sections)) + " courses"
            print(summary)
            outfile.write(summary + "\n")
            for Y in sections:
                print(Y)
                outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
# All students enrolled in a class in the given semester. Simpler version of below. Return SET of course_ids.
def users_in_semester():

View File

@ -6,6 +6,53 @@
# todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term="sp23"):
    """Report course sections grouped by their start date.

    Reads cache/<term>_sched.json (a list of section records), groups the
    sections by the start date parsed from each record's 'date' field, and
    writes the result to cache/<term>_latestarts.txt while echoing it to
    stdout.

    Args:
        term: short semester code used to build both cache file names.
            Defaults to "sp23", the value that used to be hard-coded.

    Raises:
        OSError: if either cache file cannot be opened.
        KeyError: if a record lacks one of the 'date', 'code', 'crn',
            'teacher' or 'type' keys that are read here.
    """
    term_in = "cache/" + term + "_sched.json"
    term_out = "cache/" + term + "_latestarts.txt"
    print("Writing output to " + term_out)

    # Context manager so the schedule file is closed as soon as it is parsed.
    with open(term_in, "r") as infile:
        sched = json.load(infile)

    by_date = {}
    for C in sched:
        # 'date' is split on "-" and only the first piece (the start) is used.
        start = C['date'].split("-")[0]
        dept = C['code'].split(' ')[0]
        if dept in ['JLE','JFT','CWE']:
            continue
        if 'TBA' in start:
            continue
        try:
            startd = parser.parse(start)
        except (ValueError, OverflowError) as e:
            print(e, "\nproblem parsing ", start)
            # Skip this section: falling through would reuse the previous
            # section's date, or raise NameError on the first record.
            continue
        by_date.setdefault(startd, []).append(C)

    with open(term_out, "w") as outfile:
        for X in sorted(by_date):
            sections = by_date[X]
            # Dates on which 200 or more sections start are left out
            # (presumably the regular semester start - confirm).
            if len(sections) >= 200:
                continue
            prettydate = X.strftime("%A, %B %d")
            summary = prettydate + ": " + str(len(sections)) + " courses"
            print(summary)
            outfile.write(summary + "\n")
            for Y in sections:
                print(Y)
                outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
online_courses = {}
def prep_online_courses_df():
global online_courses

View File

@ -2074,7 +2074,7 @@ def expand_old_semesters():
input('press return to continue.')
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
def list_latestarts(term="sp23"):
show_summary = 1

View File

@ -30,6 +30,25 @@ from pipelines import header, url, fetch
#from localcache import local_data_folder, sqlite_file, db, user_goo_to_email
#########
######### BOOKSTORE
#########
#########
def scrape_bookstore():
    """Load the cached bookstore course list (work-in-progress stub).

    Reads and parses cache/bookstore_courses.json. The URL locals below are
    not requested yet; they record the bookstore endpoints alongside notes on
    where the material lists sit in the responses. Nothing is returned.

    Raises:
        OSError: if the cache file cannot be opened.
        json.JSONDecodeError: if the cache file is not valid JSON.
    """
    big_courselist_url = "https://svc.bkstr.com/courseMaterial/courses?storeId=10190&termId=100058761"

    # Context manager so the cache file handle is closed once it is parsed.
    with open('cache/bookstore_courses.json', 'r') as cache_file:
        bcu_cached = json.load(cache_file)

    one_section = "https://svc.bkstr.com/courseMaterial/results?storeId=10190&langId=-1&catalogId=11077&requestType=DDCSBrowse" # NO TEXT

    another_section = "https://svc.bkstr.com/courseMaterial/results?storeId=10190&langId=-1&catalogId=11077&requestType=DDCSBrowse" # 3 REQUIRED at:
    # [""0""].courseSectionDTO[""0""].courseMaterialResultsList
    #
    # and also:
    #
    # [""0""].courseSectionDTO[""0""].sectionAdoptionDTO.materialAdoptions
def survey_answer(q=0):

View File

@ -42,7 +42,13 @@ def clean_title(st):
if len(st)>50: return st[:50]+'...'
return st
def int_or_zero(x):
    """Convert x to an int, treating None as 0.

    Args:
        x: None, or any value accepted by int().

    Returns:
        0 when x is None, otherwise int(x).

    Raises:
        ValueError, TypeError: propagated from int() when x cannot be
            converted.
    """
    # Identity check: `x == None` defers to x.__eq__, which an arbitrary
    # object may override, so a non-None value could be mistaken for None.
    if x is None:
        return 0
    return int(x)
def float_or_zero(x):
    """Convert x to a float, treating None as 0.

    Args:
        x: None, or any value accepted by float().

    Returns:
        The integer 0 when x is None (kept as an int for backward
        compatibility), otherwise float(x).

    Raises:
        ValueError, TypeError: propagated from float() when x cannot be
            converted.
    """
    # Identity check: `x == None` defers to x.__eq__, which an arbitrary
    # object may override, so a non-None value could be mistaken for None.
    if x is None:
        return 0
    return float(x)
def match59(x):
if x['links']['context']==7959: return True