synced pipelines w/ surface

This commit is contained in:
Peter Howell 2023-03-24 09:58:20 -07:00
parent 411971cf29
commit 37e8007035
1 changed file with 252 additions and 36 deletions

View File

@ -5,6 +5,7 @@ from time import strptime
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from util import UnicodeDictReader from util import UnicodeDictReader
from datetime import datetime as dt from datetime import datetime as dt
from dateutil import parser
import pandas as pd import pandas as pd
import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib, datetime import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib, datetime
@ -29,9 +30,6 @@ And some subsequent processing:
- Raw logs into something more useful - Raw logs into something more useful
""" """
verbose = False verbose = False
users = {} users = {}
@ -428,6 +426,10 @@ def getSemesterSchedule(short='sp21'): # I used to be current_sch
return schedule return schedule
def get_enrlmts_for_user(user, enrollments):
    """Return one user's active enrollments as (type, course_id) rows.

    user        -- user id matched against the 'user_id' column
    enrollments -- pandas DataFrame with 'user_id', 'workflow', 'type'
                   and 'course_id' columns
    """
    is_user = enrollments['user_id'] == user
    is_active = enrollments['workflow'] == 'active'
    return enrollments[is_user & is_active][['type', 'course_id']]
################ ################
@ -812,7 +814,7 @@ def to_section_list(input_text,verbose=0):
if 'loc' in r: if 'loc' in r:
if r['loc'] == 'ONLINE': r['type'] = 'online' if r['loc'] == 'ONLINE': r['type'] = 'online'
if r['loc'] == 'ONLINE' and r['time']: r['type'] = 'online live'
if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live' if r['loc'] == 'ONLINE LIVE': r['type'] = 'online live'
if r['loc']: r['site'] = room_to_site(r['loc'],verbose) if r['loc']: r['site'] = room_to_site(r['loc'],verbose)
@ -893,9 +895,7 @@ def log_section_filling2(current_sched_list):
reg_data_filename = 'reg_data_' + short_sem + '.csv' reg_data_filename = 'reg_data_' + short_sem + '.csv'
new_df.to_csv('cache/' + reg_data_filename, index=False) new_df.to_csv('cache/' + reg_data_filename, index=False)
put_file('/web/phowell/schedule/', 'cache/', reg_data_filename, 0) put_file('/home/public/schedule/', 'cache/', reg_data_filename, 0)
print('ok')
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed # Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
@ -1022,9 +1022,7 @@ def scrape_schedule():
codecs.open('cache/' + filename, 'w', 'utf-8').write(jj) codecs.open('cache/' + filename, 'w', 'utf-8').write(jj)
put_file('/home/public/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
put_file('/web/phowell/schedule/', 'cache/', filename, 0) # /gavilan.edu/_files/php/
return as_dict return as_dict
@ -1081,7 +1079,8 @@ def recent_schedules():
# Take the generically named rosters uploads files and move them to a semester folder and give them a date. # Take the generically named rosters uploads files and move them to a semester folder and give them a date.
def move_to_folder(sem,year,folder): def move_to_folder(sem,year,folder):
semester = year+sem semester = year+sem
semester_path = 'cache/rosters/%s' % semester if not os.path.isdir('cache/rosters/'+semester):
os.makedirs('cache/rosters/'+semester)
now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M')
print("+ Moving roster files to folder: %s" % semester_path) print("+ Moving roster files to folder: %s" % semester_path)
if not os.path.isdir(semester_path): if not os.path.isdir(semester_path):
@ -1159,7 +1158,7 @@ def fetch_current_rosters():
with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp: with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp:
sftp.chdir('SIS') sftp.chdir('SIS')
files = sftp.listdir() files = sftp.listdir()
print("\n--> %s I see these files at instructure ftp site: " % dt_label ) print("--> %s I see these files at instructure ftp site: " % dt_label )
[print(" %s" % f) for f in files] [print(" %s" % f) for f in files]
i = 0 i = 0
got_courses = 0 got_courses = 0
@ -1287,9 +1286,9 @@ def schedule_filling():
# Upload a json file to www # Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1): def put_file(remotepath,localpath, localfile,prompt=1):
show_all = 0
folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
cnopts = pysftp.CnOpts() cnopts = pysftp.CnOpts()
cnopts.hostkeys = None cnopts.hostkeys = None
with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp:
@ -1298,12 +1297,9 @@ def put_file(remotepath,localpath, localfile,prompt=1):
#print(folder + "\tI see these files on remote: ", files, "\n") #print(folder + "\tI see these files on remote: ", files, "\n")
sftp.chdir(remotepath) sftp.chdir(remotepath)
files = sftp.listdir() files = sftp.listdir()
print(folder + "\tI see these files on remote: ", files, "\n") if show_all: print(folder + "\tI see these files on remote: ", files, "\n")
localf = os.listdir(localpath) localf = os.listdir(localpath)
if show_all: print("I see these local: ", localf)
print("I see these local: ", localf)
if prompt: if prompt:
input('ready to upload') input('ready to upload')
sftp.put(localpath+localfile, localfile, preserve_mtime=True) sftp.put(localpath+localfile, localfile, preserve_mtime=True)
@ -1862,32 +1858,249 @@ def scrape_schedule_multi():
global SEMESTER, short_sem, semester_begin, filename, filename_html global SEMESTER, short_sem, semester_begin, filename, filename_html
SEMESTER = 'Summer 2022' SEMESTER = 'Spring 2023'
short_sem = 'su22' short_sem = 'sp23'
semester_begin = strptime('06/13', '%m/%d') semester_begin = strptime('01/30', '%m/%d')
filename = 'su22_sched.json' filename = 'sp23_sched.json'
filename_html = 'su22_sched.html' filename_html = 'sp23_sched.html'
scrape_schedule() SEM = ['Fall 2022', 'Summer 2022 (View only)', 'Spring 2022 (View only)',
'Fall 2021 (View only)', 'Summer 2021 (View only)', 'Spring 2021 (View only)', 'Fall 2020 (View only)', 'Summer 2020 (View only)', 'Spring 2020 (View only)',
'Fall 2019 (View only)', 'Summer 2019 (View only)', 'Spring 2019 (View only)', 'Fall 2018 (View only)', 'Summer 2018 (View only)', 'Spring 2018 (View only)' ]
srt = 'fa22,su22,sp22,fa21,su21,sp21,fa20,su20,sp20,fa19,su19,sp19,fa18,su18,sp18'.split(',')
beg = ['08/22','06/13','01/31','08/23','06/14','02/01','08/24','06/15','01/27','08/26','06/17','01/28','08/27','06/18','01/29']
#for i in [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]:
#SEMESTER = SEM[i]
#short_sem = srt[i]
#semester_begin = strptime(beg[i], '%m/%d')
#filename = '%s_sched.json' % short_sem
#filename_html = '%s_sched.html' % short_sem
as_dict = scrape_schedule()
expanded = list_latestarts(short_sem)
fields = "gp,dean,dept,num,code,crn,teacher,name,act,cap,site,type".split(",")
ffcsv = codecs.open('cache/enrollment_%s.csv' % short_sem, 'w', 'utf-8')
with ffcsv as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(fields)
for S in expanded:
parts = S['code'].split(' ')
S['dept'] = parts[0]
S['num'] = parts[1]
S['gp'] = gp[parts[0]]
S['dean'] = dean[parts[0]]
S['sem'] = short_sem
# S['act'] = S['cap']
if S['loc'] == "ONLINE LIVE": S['site'] = 'OnlineLive'
csvwriter.writerow( [ S[x] for x in fields ] )
put_file('/home/public/schedule/', 'cache/', 'enrollment_%s.csv' % short_sem, 0)
def scrape_for_db():
    """Scrape the current (Spring 2023) schedule and emit it as a SQL script.

    Sets the module-level semester globals, scrapes the schedule via
    scrape_schedule(), then writes cache/<short_sem>_sched.sql containing a
    CREATE TABLE plus one INSERT per section, followed by an UPDATE that
    normalizes ONLINE LIVE sections' site.  Relies on the module-level
    ``gp`` and ``dean`` lookup tables keyed by department code.
    """
    global SEMESTER, gp, dean, short_sem, semester_begin, filename, filename_html
    fields = 'sem,crn,dept,num,gp,dean,code,name,teacher,type,cap,act,loc,site,date,days,time,cred,ztc'.split(',')
    SEMESTER = 'Spring 2023 (View only)'
    short_sem = 'sp23'
    semester_begin = strptime('01/30', '%m/%d')
    filename = 'sp23_sched.json'
    filename_html = 'sp23_sched.html'
    as_dict = scrape_schedule()
    # BUG FIX: formatting with `filename` produced 'sp23_sched.json_sched.sql';
    # the semester code is what was intended.
    fff = codecs.open('cache/%s_sched.sql' % short_sem, 'w', 'utf-8')
    fff.write("CREATE TABLE IF NOT EXISTS schedule ( id text, sem text, dept text, num text, gp text, dean text, code text, crn text, name text, teacher text,mode text, loc text, cap text, act text, site text, date text, cred text, ztc text, days text, time text);\n")
    for S in as_dict:
        # Derive department / course number from the 'code' field, e.g. 'CSIS 1'.
        parts = S['code'].split(' ')
        S['dept'] = parts[0]
        S['num'] = parts[1]
        S['gp'] = gp[parts[0]]
        S['dean'] = dean[parts[0]]
        S['sem'] = short_sem
        # Renamed from `str` so the builtin is not shadowed; single quotes are
        # stripped from values because they are spliced into quoted SQL.
        stmt = "INSERT INTO schedule (sem,crn,dept,num,gp,dean,code,name,teacher,mode,cap,act,loc,site,date,days,time,cred,ztc) VALUES (%s);\n" % \
            ", ".join(["'" + re.sub(r"'", "", S[x]) + "'" for x in fields])
        print(stmt)
        fff.write(stmt)
    fff.write('UPDATE schedule SET site="OnlineLive" WHERE loc="ONLINE LIVE";\n')
    fff.close()
def argos_data(term='sp23'):
    """Convert an Argos draft-schedule CSV into a slim enrollment CSV.

    Reads cache/sched_draft_<term>.csv and writes cache/enrollment_<term>.csv
    with one row per section (gp, dean, dept, num, code, crn, name, act, site).
    Off-campus sections report their session value as the site.  Relies on
    the module-level ``dean`` and ``gp`` lookup tables keyed by subject code.

    term -- short semester code used in both file names (default 'sp23',
            matching the previous hard-coded behavior)
    """
    global dean, gp
    # `with` ensures both handles are closed/flushed (the old code leaked them).
    with codecs.open('cache/enrollment_%s.csv' % term, 'w', 'utf-8') as f2, \
         codecs.open('cache/sched_draft_%s.csv' % term, 'r', 'utf-8') as f:
        writer = csv.writer(f2)
        writer.writerow('gp dean dept num code crn name act site'.split(' '))
        reader = csv.reader(f, delimiter=',')
        headers = next(reader)
        for r in reader:
            d = dict(zip(headers, r))
            print(d)
            dept = d['Subj']
            my_dean = dean[dept]
            my_gp = gp[dept]
            num = d['Crse No']
            code = dept + " " + num
            crn = d['CRN']
            name = d['Course Title']
            act = d['Open Seats']
            # Off-campus sections carry their location in 'Session' instead.
            if d['Campus'] == "Off Campus":
                site = d['Session']
            else:
                site = d['Campus']
            print(site)
            writer.writerow([my_gp, my_dean, dept, num, code, crn, name, act, site])
def expand_old_semesters():
    """Regenerate the expanded schedule json for past semesters.

    Walks the terms newest-first, calling list_latestarts() for each, and
    pauses for a keypress between terms so the uploads can be supervised.

    NOTE: only sp16..su20 are processed; the original longer list through
    fa22 was dead code (immediately overwritten) and has been removed.
    """
    terms = 'sp16,su16,fa16,sp17,su17,fa17,sp18,su18,fa18,sp19,su19,fa19,sp20,su20'.split(',')
    terms.reverse()
    for t in terms:
        list_latestarts(t)
        input('press return to continue.')
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term):
    """Expand a semester's scraped schedule with parsed dates and times.

    Reads cache/<term>_sched.json, derives per-section fields (in-person /
    hybrid type, time_start/time_end, start/end month-day strings, day of
    year), writes cache/<term>_sched_expanded.json (and uploads it via
    put_file), and writes cache/<term>_latestarts.txt grouping courses by
    start date.

    term -- short semester code such as 'sp23'; the century is assumed to
            be 20xx when reconstructing full dates.
    Returns the list of expanded section dicts.
    """
    show_summary = 1
    the_year = '20' + term[2:4]
    print("year: ", the_year, " semester: ", term)
    term_in = "cache/%s_sched.json" % term
    term_out = "cache/%s_latestarts.txt" % term
    expanded_out = "%s_sched_expanded.json" % term
    print("Writing output to " + term_out)
    infile = codecs.open(term_in, "r", "utf-8")
    sched = json.loads(infile.read())
    infile.close()
    outfile = codecs.open(term_out, "w", "utf-8")
    exoutfile = codecs.open('cache/' + expanded_out, "w", "utf-8")
    expanded = []
    by_date = {}
    if show_summary: print("course \t loc \t type \t time")
    for C in sched:
        # Untyped sections that are not ONLINE default to in-person.
        if (not C['type']) and C['loc'] != 'ONLINE':
            C['type'] = 'in-person'
        if show_summary: print("%s \t %s \t %s \t %s" % (C['code'], C['loc'], C['type'], C['time']))
        # A secondary 'online' meeting pattern on a non-ONLINE-LIVE section
        # marks the section as hybrid.
        if 'extra' in C:
            if 'partofday' in C and ('type' in C['extra'][0]) and (C['extra'][0]['type'] == 'online') and C['loc'] != "ONLINE LIVE":
                C['type'] = 'hybrid'
        # Split 'H:MM am-H:MM pm' into 24-hour-ish start/end strings.
        times = C['time'].split("-")
        if len(times) > 1:
            time_start = times[0]
            time_end = times[1]
            try:
                startt = time.strptime(time_start, "%I:%M %p")
                endt = time.strptime(time_end, "%I:%M %p")
                min_start = "00" if startt.tm_min == 0 else str(startt.tm_min)
                min_end = "00" if endt.tm_min == 0 else str(endt.tm_min)
                C['time_start'] = "%i:%s" % (startt.tm_hour, min_start)
                C['time_end'] = "%i:%s" % (endt.tm_hour, min_end)
            except Exception as e:
                print(e, "\n-- problem parsing time ", time_start, " or ", time_end)
        else:
            C['time_start'] = ''
            C['time_end'] = ''
        # TBA dates get empty start/end and are not grouped by date.
        if re.search('TBA', C['date']):
            C['start'] = ''
            C['end'] = ''
            C['doy'] = ''
            expanded.append(C)
            continue
        parts = C['date'].split("-")
        start = parts[0] + "/" + the_year
        end = parts[1] + "/" + the_year
        try:
            startd = parser.parse(start)
            endd = parser.parse(end)
        except Exception as e:
            print(e, "\n-- problem parsing ", start, " or ", end)
            # BUG FIX: the old code fell through and filed the course under
            # the PREVIOUS iteration's (stale, or undefined) start date;
            # skip unparseable dates instead.
            continue
        C['start'] = "%i-%i" % (startd.month, startd.day)
        C['end'] = "%i-%i" % (endd.month, endd.day)
        C['doy'] = startd.timetuple().tm_yday
        expanded.append(C)
        by_date.setdefault(startd, []).append(C)
    exoutfile.write(json.dumps(expanded, indent=2))
    exoutfile.close()
    put_file('/home/public/schedule/', 'cache/', expanded_out, 0)
    for X in sorted(by_date.keys()):
        # Very large buckets are the semester-wide start date, not late starts.
        if len(by_date[X]) < 200:
            prettydate = X.strftime("%A, %B %d")
            outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
            for Y in by_date[X]:
                outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
    outfile.close()
    return expanded
if __name__ == "__main__": if __name__ == "__main__":
print ('') print ('')
@ -1900,7 +2113,10 @@ if __name__ == "__main__":
7: ['Canvas data: automated sync', sync_non_interactive ], 7: ['Canvas data: automated sync', sync_non_interactive ],
8: ['Scrape schedule from ssb', scrape_schedule_multi ], 8: ['Scrape schedule from ssb', scrape_schedule_multi ],
9: ['Test ssb calls with python', scrape_schedule_py ], 9: ['Test ssb calls with python', scrape_schedule_py ],
10: ['Parse deanza schedule', dza_sched ], 10: ['schedule to db', scrape_for_db ],
11: ['clean argos draft schedule file', argos_data],
12: ['make expanded schedule json files of old semesters', expand_old_semesters ],
13: ['Parse deanza schedule', dza_sched ],
} }
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):