#from sqlite3 import paramstyle
#from time import strptime
#from util import UnicodeDictReader
import codecs, json, requests, re, csv, datetime, pysftp, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib, asyncio
#import pdb
from datetime import timedelta
#from collections import defaultdict

from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import instructure_url, instructure_username, instructure_private_key

from dap.api import DAPClient
from dap.dap_types import Credentials
from dap.integration.database import DatabaseConnection
from dap.replicator.sql import SQLReplicator

"""
Everything to do with fetching data:
- from iLearn, via token
- current roster uploads from Instructure's SFTP site
- raw logs and other files from the Canvas Data repository
- from SSB, using Firefox to scrape the schedule

And some subsequent processing:
- raw roster files, into a more compact JSON format
- raw logs into something more useful
"""

verbose = False
users = {}
users_by_id = {}

# todo: all these constants for SSB -- line 1008
#
# todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python

sys.setrecursionlimit( 100000 )

local_data_folder = 'cache/canvas_data/'
mylog = codecs.open(local_data_folder + 'temp_log.txt','w')


class FetchError(Exception):
    pass


DEBUG = 0

def d(s, end=''):
    global DEBUG
    if end and DEBUG:
        print(s, end=end)
    elif DEBUG:
        print(s)

################
################ CANVAS API MAIN FETCHING FUNCTIONS
################
################
################

# Main canvas querying fxn
def fetch(target, verbose=0, params=0, media=0):
    # if there are more results, recursively call myself, adding on to the results.
    results = 0
    count = 0
    if target[0:4] != "http":
        target = url + target
    if verbose:
        print("++ Fetching: " + target)
    if media:
        r2 = requests.get(target, headers=header_media)
    elif params:
        r2 = requests.get(target, headers=header, params=params)
    else:
        r2 = requests.get(target, headers=header)
    #if verbose:
    #    print "++ Got: " + r2.text
    try:
        results = json.loads(r2.text)
        count = len(results)
    except:
        print("-- Failed to parse: ", r2.text)
    if verbose:
        print("Got %i results" % count)
    if verbose > 1:
        print(r2.headers)
    tempout = codecs.open('cache/fetchcache.txt', 'a', 'utf-8')
    tempout.write(r2.text + "\n\n")
    tempout.close()
    if ('link' in r2.headers and count > 0):
        links = r2.headers['link'].split(',')
        for L in links:
            ll = L.split(';')
            link = ll[0].replace("<", "")
            link = link.replace(">", "")
            if re.search(r'next', ll[1]):
                if (verbose):
                    print("++ More link: " + link)
                #link = re.sub(r'per_page=10$', 'per_page=100', link)
                # link.replace('per_page=10','per_page=500')
                #if (verbose): print("++ More link: " + link)
                nest = fetch(link, verbose, params, media)
                if isinstance(results, dict):
                    results.update(nest)
                else:
                    results.extend(nest)
    return results

# Main canvas querying fxn - stream version - don't die on big requests
def fetch_stream(target, verbose=0):
    # if there are more results, keep requesting the next page, adding on to the results.
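    # fetch_stream() is the generator counterpart of fetch(): it yields one decoded
    # JSON page at a time and follows the Link: rel="next" header until no further
    # page is advertised, so large collections never have to sit in memory at once.
    # Rough usage sketch (the endpoint shown is illustrative, not taken from this file):
    #   for page in fetch_stream('/api/v1/accounts/1/courses?per_page=100', verbose=1):
    #       for course in page:
    #           print(course.get('id'), course.get('name'))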
results = 0 while target: if target[0:4] != "http": target = url + target if verbose: print("++ Fetching: " + target) r2 = requests.get(target, headers = header) if r2.status_code == 502: raise FetchError() try: results = json.loads(r2.text) count = len(results) except: print("-- Failed to parse: ", r2.text) if verbose: print("Got %i results" % count) if verbose > 1: print(r2.headers) tempout = codecs.open('cache/fetchcache.txt','a','utf-8') tempout.write(r2.text+"\n\n") tempout.close() next_link_found = 0 if ('link' in r2.headers and count > 0): links = r2.headers['link'].split(',') for L in links: ll = L.split(';') link = ll[0].replace("<","") link = link.replace(">","") if re.search(r'next', ll[1]): target = link next_link_found = 1 break if not next_link_found: target = 0 yield results # for dicts with one key, collapse that one key out, cause # paging makes problems... example: enrollment_terms def fetch_collapse(target,collapse='',verbose=0): # if there are more results, recursivly call myself, adding on to the results. results = 0 if target[0:4] != "http": target = url + target if verbose: print("++ Fetching: " + target) r2 = requests.get(target, headers = header) #if verbose: #print "++ Got: " + r2.text try: results = json.loads(r2.text) except: print("-- Failed to parse: ", r2.text) if verbose: print(r2.headers) if collapse and collapse in results: results = results[collapse] if ('link' in r2.headers): links = r2.headers['link'].split(',') for L in links: ll = L.split(';') link = ll[0].replace("<","") link = link.replace(">","") if re.search(r'next', ll[1]): if (verbose): print("++ More link: " + link) nest = fetch_collapse(link, collapse, verbose) if isinstance(results,dict): results.update(nest) else: results.extend(nest) return results ################ ################ CANVAS DATA ################ ################ ################ # Get canvas data 2024 style def canvas_data_2024_run(): print("Updating all tables.") asyncio.run(canvas_data_2024()) print("Done with all tables.") async def canvas_data_2024(): base_url: str = os.environ["DAP_API_URL"] client_id: str = os.environ["DAP_CLIENT_ID"] client_secret: str = os.environ["DAP_CLIENT_SECRET"] #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" # todo: use secrets connection_string: str = "postgresql://postgres:rolley34@deep1/db" desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') credentials = Credentials.create(client_id=client_id, client_secret=client_secret) async with DatabaseConnection(connection_string).open() as db_connection: async with DAPClient(base_url, credentials) as session: #tables = await session.get_tables("canvas") for table in desired_tables: print(f" trying to update {table} ") try: #await SQLReplicator(session, db_connection).initialize("canvas", table) await SQLReplicator(session, db_connection).synchronize("canvas", table) except Exception as e: print(f" - skipping {table} because {e}") # Get canvas data 2024 style def setup_canvas_data_2024_run(): print("Setting up all tables.") asyncio.run(setup_canvas_data_2024()) print("Done with all tables.") async def setup_canvas_data_2024(): base_url: str = 
os.environ["DAP_API_URL"] client_id: str = os.environ["DAP_CLIENT_ID"] client_secret: str = os.environ["DAP_CLIENT_SECRET"] #connection_string: str = "postgresql://postgres:rolley34@192.168.1.6/db" connection_string: str = "postgresql://postgres:rolley34@192.168.1.192/db" desired_tables = "users,courses,communication_channels,context_modules,conversation_message_participants,conversation_messages,conversation_participants,conversations,course_sections,enrollment_states,enrollment_dates_overrides,enrollment_terms,enrollments,learning_outcome_groups,learning_outcome_question_results,learning_outcomes,pseudonyms,quizzes,scores,submissions,submission_versions,wiki_pages,wikis".split(',') credentials = Credentials.create(client_id=client_id, client_secret=client_secret) async with DatabaseConnection(connection_string).open() as db_connection: async with DAPClient(base_url, credentials) as session: #tables = await session.get_tables("canvas") for table in desired_tables: print(f" {table}") try: await SQLReplicator(session, db_connection).initialize("canvas", table) except Exception as e: print(f" - skipping {table} because {e}") ################ ################ ROSTERS AND REGISTRATION ################ ################ ################ # todo: the pipeline is disorganized. Organize it to have # a hope of taking all this to a higher level. # # todo: where does this belong in the pipeline? compare with recent_schedules() # Take the generically named rosters uploads files and move them to a semester folder and give them a date. def move_to_folder(sem,year,folder,files): semester = year+sem semester_path = 'cache/rosters/%s' % semester if not os.path.isdir('cache/rosters/'+semester): os.makedirs('cache/rosters/'+semester) now = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M') print("+ Moving roster files to folder: %s" % semester_path) if not os.path.isdir(semester_path): print("+ Creating folder: %s" % semester_path) os.makedirs(semester_path) if 'courses.csv' in files: os.rename('cache/rosters/courses-%s.csv' % folder, 'cache/rosters/%s/courses.%s.csv' % (semester,now)) if 'enrollments.csv' in files: os.rename('cache/rosters/enrollments-%s.csv' % folder, 'cache/rosters/%s/enrollments.%s.csv' % (semester,now)) if 'users.csv' in files: os.rename('cache/rosters/users-%s.csv' % folder, 'cache/rosters/%s/users.%s.csv' % (semester,now)) # Take raw upload (csv) files and make one big json out of them. # This relates to enrollment files, not schedule. def convert_roster_files(semester="",year="",folder=""): if not semester: semester = input("the semester? (ex: spring) ") folder = input("Folder? (ex 2020-02-25-14-58-20) ") uf = open('cache/rosters/users-'+folder+'.csv','r') cf = open('cache/rosters/courses-'+folder+'.csv','r') ef = open('cache/rosters/enrollments-'+folder+'.csv','r') u = csv.DictReader(uf) c = csv.DictReader(cf) e = csv.DictReader(ef) uu = [i for i in u] cc = [i for i in c] ee = [i for i in e] uf.close() cf.close() ef.close() myrosterfile = 'cache/rosters/roster_%s_%s.json' % (year, semester) if os.path.exists(myrosterfile): print(" -- Moving previous combined roster json file. opening %s ..." % myrosterfile) last_fileobj = open(myrosterfile,'r') last_file = json.load(last_fileobj) last_fileobj.close() info = last_file[3] last_date = info['date_filestring'] print(' -- writing: cache/rosters/%s%s/roster_%s.json ...' % (year,semester,last_date)) try: os.rename(myrosterfile, 'cache/rosters/%s%s/roster_%s.json ...' 
% (year,semester,last_date)) print(' -- ok') except Exception as e: print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) print(e) myrosterfile = "new_" + myrosterfile pass #os.remove('cache/old_rosters/roster_'+semester+'.'+last_date+'.json') #os.rename(myrosterfile, 'cache/old_rosters/roster_'+semester+'.'+last_date+'.json') newinfo = {'date_filestring': datetime.datetime.now().strftime('%Y-%m-%dT%H-%M'), } try: new_roster = codecs.open(myrosterfile,'w', 'utf-8') new_roster.write( json.dumps( [uu,cc,ee,newinfo], indent=2 )) new_roster.close() print(" -- Wrote roster info to: %s." % myrosterfile) except Exception as e: print(" ** Failed because i couldn't move the previous roster file: %s" % myrosterfile) print(" ** " + str(e)) def file_doesnt_exist(name): # Get list of files in current directory files = os.listdir() # Filter out zero-size files and directories files = [f for f in files if os.path.isfile(f) and os.path.getsize(f) > 0] if name in files: print( f" * file: {name} already exists. not downloading." ) else: print( f" * file: {name} downloading." ) # Check if the file exists in the filtered list return not (name in files) # From instructure sftp site def fetch_current_rosters(): cnopts = pysftp.CnOpts() cnopts.hostkeys = None with pysftp.Connection(instructure_url,username=instructure_username, private_key=instructure_private_key,cnopts=cnopts) as sftp: sftp.chdir('SIS') files = sftp.listdir() ff = open('cache/pipeline.log.txt','a') now = datetime.datetime.now() exact_time = now.strftime('%Y-%m-%d-%H-%M-%S') rounded_hour = (now.replace(second=0, microsecond=0, minute=0, hour=now.hour) + timedelta(hours=now.minute//30)) rounded_time = rounded_hour.strftime('%Y-%m-%d-%H') if len(files)>0: # and 'users.csv' in files: print(f"--> {exact_time}: I see these files at instructure ftp site:") [print(f" - {f}") for f in files] i = 0 seen_files = [] check = ['login','users','courses','enrollments'] for checking in check: try: if f'{checking}.csv' in files and file_doesnt_exist(f'{checking}-{rounded_time}.csv'): sftp.get(f'{checking}.csv',f'cache/rosters/{checking}-{rounded_time}.csv') i += 1 seen_files.append(f'{checking}.csv') except: print(f' * {checking}.csv not present') print(' Saved %i data files in rosters folder.' % i) ff.write( f" Saved {i} data files: {seen_files}") if i>2: if 'courses.csv' in seen_files: courses = open(f'cache/rosters/courses-{rounded_time}.csv','r') courses.readline() a = courses.readline() print(a) courses.close() parts = a.split(',') year = parts[1][0:4] ss = parts[1][4:6] sem = {'30':'spring', '50':'summer', '70':'fall' } this_sem = sem[ss] print(f" -> This semester is: {this_sem}, {year}" ) print(f" -> Building data file... {rounded_time}") convert_roster_files(this_sem,year,rounded_time) print(' -> moving files...') ff.write( f" Moved files to folder: {this_sem} {year} {rounded_time}\n") move_to_folder(this_sem,year,rounded_time,seen_files) else: print(" * No courses file. Not moving files.") ff.write( f" * No courses file. 
Not moving files.\n") else: print(f"--> {exact_time}: Don't see files.") sftp.close() def fetch_current_rosters_auto(): fetch_minute = "56,57,58,59,00,01,02,03,04,05,06".split(",") for m in fetch_minute: schedule.every().hour.at(f":{m}").do(fetch_current_rosters) #schedule.every().day.at("12:35").do(sync_non_interactive) #schedule.every().day.at("21:00").do(sync_non_interactive) #print(f"running every hour on the :{fetch_minute}\n") while True: try: schedule.run_pending() time.sleep(4) except Exception as e: import traceback print(" ---- * * * Failed with: %s" % str(e)) ff = open('cache/pipeline.log.txt','a') ff.write(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + "\n") ff.write(traceback.format_exc()+"\n---------\n\n") ff.close() #schedule.CancelJob time.sleep(1) # Canvas data, download all new files def sync_non_interactive(): resp = do_request('/api/account/self/file/sync') mylog.write(json.dumps(resp, indent=4)) #mylog.close() gotten = os.listdir(local_data_folder) wanted = [] i = 0 for x in resp['files']: filename = x['filename'] exi = "No " if filename in gotten: exi = "Yes" else: wanted.append(x) print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename )) i += 1 print("I will attempt to download %i files." % len(wanted)) #answer = input("Press enter to begin, or q to quit ") #if not answer == '': return good_count = 0 bad_count = 0 for W in wanted: print("Downloading: " + W['filename']) response = requests.request(method='GET', url=W['url'], stream=True) if(response.status_code != 200): print('Request response went bad. Got back a %s code, meaning the request was %s' % \ (response.status_code, response.reason)) print('URL: ' + W['url']) bad_count += 1 else: #Use the downloaded data with open(local_data_folder + W['filename'], 'wb') as fd: for chunk in response.iter_content(chunk_size=128): fd.write(chunk) print("Success") good_count += 1 print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count)) ## OLD STYLE CANVAS DATA # Get something from Canvas Data def do_request(path): #Set up the request pieces method = 'GET' host = 'api.inshosteddata.com' apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT') apiContentType = 'application/json' msgList = [] msgList.append(method) msgList.append(host) msgList.append(apiContentType) msgList.append('') msgList.append(path) msgList.append('') msgList.append(apiTime) msgList.append(apiSecret) msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8') sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest()) sig = sig.decode('utf-8') headers = {} headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig) headers['Date'] = apiTime headers['Content-type'] = apiContentType #Submit the request/get a response uri = "https://"+host+path print (uri) print (headers) response = requests.request(method='GET', url=uri, headers=headers, stream=True) #Check to make sure the request was ok if(response.status_code != 200): print(('Request response went bad. 
Got back a ', response.status_code, ' code, meaning the request was ', response.reason)) else: #Use the downloaded data jsonData = response.json() #print(json.dumps(jsonData, indent=4)) return jsonData ################ ################ SENDING DATA AWAY ################ ################ ################ # Upload a json file to www def put_file(remotepath,localpath, localfile,prompt=1): show_all = 0 folder = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') cnopts = pysftp.CnOpts() cnopts.hostkeys = None with pysftp.Connection(FTP_SITE,username=FTP_USER, password=FTP_PW,cnopts=cnopts) as sftp: #todo: these paths #files = sftp.listdir() #print(folder + "\tI see these files on remote: ", files, "\n") sftp.chdir(remotepath) files = sftp.listdir() if show_all: print(folder + "\tI see these files on remote: ", files, "\n") localf = os.listdir(localpath) if show_all: print("I see these local: ", localf) if prompt: input('ready to upload') sftp.put(localpath+localfile, localfile, preserve_mtime=True) sftp.close() """ # copy files and directories from local static, to remote static, # preserving modification times on the files for f in localf: print("This local file: " + f + " ", end=' ') if not f in files: sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True) print("Uploaded.") else: print("Skipped.") """ """if len(files)==3 and 'users.csv' in files: sftp.get('courses.csv','rosters/courses-'+folder+'.csv') sftp.get('users.csv','rosters/users-'+folder+'.csv') sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv') print folder + '\tSaved three data files in rosters folder.' courses = open('rosters/courses-'+folder+'.csv','r') courses.readline() a = courses.readline() print a courses.close() parts = a.split(',') year = parts[1][0:4] ss = parts[1][4:6] #print parts[1] sem = {'30':'spring', '50':'summer', '70':'fall' } this_sem = sem[ss] #print this_sem, "", year print folder + '\tbuilding data file...' convert_roster_files(this_sem,year,folder) print folder + '\tmoving files...' move_to_folder(this_sem,year,folder) else: print folder + "\tDon't see all three files.""" ################ ################ GOOGLE DOCS ################ ################ ################ def sec(t): return "
<h2>"+t+"</h2>\n"

def para(t): return "<p>"+t+"</p>\n"

def ul(t): return "<ul>\n"

def li(t): return "<li>"+t+"</li>\n"

def question(t, bracket=1):
    ret = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        ret += "<a name='" + match.group(1).lower() + "'></a>"
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        id = ''
        for p in parts:
            if re.search(r'[a-zA-Z]', p[0]):
                id += p[0]
        ret += "<a name='%s'></a>" % id.lower()
    return ret + '<h3>' + t + '</h3>\n<p>'

def answer(t): return t + '</p>\n'

def read_paragraph_element(element, type="NORMAL_TEXT"):
    """Returns the text in the given ParagraphElement.

    Args:
        element: a ParagraphElement from a Google Doc.
    """
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run:
        return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = ''
        end = ''
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<b>' + begin
        end = end + '</b>'
    content = text_run.get('content')
    content = re.sub(u'\u000b','<br>
    \n',content) return begin + content + end def get_doc(docid, bracket=1, verbose=0): import pickle import os.path from googleapiclient.discovery import build from google_auth_oauthlib.flow import InstalledAppFlow from google.auth.transport.requests import Request #ooout = open(fileout,'w') # If modifying these scopes, delete the file token.pickle. SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] creds = None # The file token.pickle stores the user's access and refresh tokens, and is # created automatically when the authorization flow completes for the first # time. if os.path.exists('token.pickle'): with open('token.pickle', 'rb') as token: creds = pickle.load(token) # If there are no (valid) credentials available, let the user log in. if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: flow = InstalledAppFlow.from_client_secrets_file( 'credentials.json', SCOPES) creds = flow.run_local_server(port=0) # Save the credentials for the next run with open('token.pickle', 'wb') as token: pickle.dump(creds, token) service = build('docs', 'v1', credentials=creds) # Retrieve the documents contents from the Docs service. document = service.documents().get(documentId=docid).execute() if verbose: print(document) tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n") if verbose: print('The title of the document is: {}'.format(document.get('title'))) doc_content = document.get('body').get('content') if verbose: print(doc_content) doc_objects = document.get('inlineObjects') if verbose: print(doc_objects) doc_lists = document.get('lists') text = '
    ' last_type = '' answer_text = '' in_a_list = '' img_count = 1 img_lookup = {} img_heights = {} img_widths = {} if doc_objects: for k,value in doc_objects.items(): tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") if 'inlineObjectProperties' in value: if 'embeddedObject' in value['inlineObjectProperties']: if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: print(k) uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] response = requests.get(uu, stream=True) name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] img_count += 1 img_lookup[k] = name with open('cache/doc_images/'+name, 'wb') as out_file: shutil.copyfileobj(response.raw, out_file) print(uu) print(response.headers) print(name) #input('x?') del response if 'size' in value['inlineObjectProperties']['embeddedObject']: img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) tempout.write('- - - - - - - -\n\n') #for value in doc_lists: # tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n") tempout.write('- - - - - - - -\n\n') list_stack = [] list_depth = 0 last_list_depth = 0 for value in doc_content: tempout.write( json.dumps(value,indent=2) + "\n\n\n") if verbose: print(json.dumps(value, sort_keys=True, indent=4)) # todo: x link, x bold, list, image. tag_fxn = para if 'paragraph' in value: this_text = '' if 'bullet' in value['paragraph']: # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one. lid = value['paragraph']['bullet']['listId'] if not list_stack: # 1 list_stack.append(lid) else: if lid == list_stack[0]: # 2 pass else: if not lid in list_stack: # 3 list_stack.append(lid) else: # 4 x = list_stack.pop() while x != lid: list_stack.pop() elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open. list_stack = [] list_depth = len(list_stack) deeper = list_depth - last_list_depth if deeper > 0: answer_text += "" * deeper if len(list_stack): tag_fxn = li elements = value.get('paragraph').get('elements') # inlineObjectElement": { # "inlineObjectId": "kix.ssseeu8j9cfx", if 'paragraphStyle' in value.get('paragraph'): style = value.get('paragraph').get('paragraphStyle') #text += json.dumps(style, sort_keys=True, indent=4) if 'namedStyleType' in style: type = style['namedStyleType'] for elem in elements: # text content this_text += read_paragraph_element(elem,type) # image content if 'inlineObjectElement' in elem: vpi = elem['inlineObjectElement'] if 'inlineObjectId' in vpi: ii = vpi['inlineObjectId'] if ii in img_lookup: img = img_lookup[ii] h = img_heights[ii] w = img_widths[ii] this_text += '' % (img,w,h) if last_type=='NORMAL_TEXT' and type!=last_type: text += answer(answer_text) answer_text = '' if type=='HEADING_2': text += sec(this_text) this_text = '' elif type=='HEADING_3': text += question(this_text,bracket) this_text = '' else: answer_text += tag_fxn(this_text) this_text = '' last_type = type last_list_depth = list_depth elif 'table' in value: # The text in table cells are in nested Structural Elements and tables may be # nested. 
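        # For now a table is emitted as the literal placeholder text "TABLE" rather than
        # being rendered. The commented-out sketch below shows the intended approach:
        # walk table -> tableRows -> tableCells and feed each cell's 'content' (a nested
        # list of Structural Elements) back through the same element reader, recursing
        # because tables can contain tables.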
text += "\nTABLE\n" #table = value.get('table') #for row in table.get('tableRows'): # cells = row.get('tableCells') # for cell in cells: # text += read_strucutural_elements(cell.get('content')) #elif 'tableOfContents' in value: # # The text in the TOC is also in a Structural Element. # toc = value.get('tableOfContents') # text += read_strucutural_elements(toc.get('content')) #else: # print(json.dumps(value, sort_keys=True, indent=4)) text += answer(answer_text) #text += '
    ' #print(text) return text ######### TRY #2 ###### def read_paragraph_element_2(element,type="NORMAL_TEXT"): text_run = element.get('textRun') begin = '' end = '' if not text_run: return '' if 'textStyle' in text_run and 'link' in text_run['textStyle']: begin = '' end = '' if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT": begin = '' + begin end = end + '' elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT": begin = '' + begin end = end + '' content = text_run.get('content') content = re.sub(u'\u000b','
    \n',content) return begin + content + end # t is a string that begins with "Icons: " ... and contains comma(space) separated list def handle_icons(t): text = t[7:].strip() parts = text.split(", ") return ('icons',parts) # t is a string that begins with "Tags: " ... and contains comma(space) separated list def handle_tags(t): text = t[6:].strip() parts = text.split(", ") return ('tags',parts) def handle_question(t,bracket=1): anchor = '' match = re.search( r'\[(.*)\]', t) if match and bracket: anchor = match.group(1).lower() t = re.sub( r'\[.*\]','',t) else: parts = t.split(' ') for p in parts: if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower() return ('question', t, anchor) def handle_answer(t): return ('answer',t) def handle_sec(t): return ('section',t) def handle_para(t): return ('paragraph',t) def handle_ul(t): return ('unorderdedlist',t) def handle_li(t): return ('listitem',t) img_count = 1 img_lookup = {} img_heights = {} img_widths = {} def fetch_doc_image(k,value): global img_count, img_lookup, img_heights, img_widths if 'inlineObjectProperties' in value: if 'embeddedObject' in value['inlineObjectProperties']: if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']: if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']: print(k) uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri'] response = requests.get(uu, stream=True) name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1] img_count += 1 img_lookup[k] = name with open('cache/doc_images/'+name, 'wb') as out_file: shutil.copyfileobj(response.raw, out_file) print(uu) print(response.headers) print(name) del response if 'size' in value['inlineObjectProperties']['embeddedObject']: img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude']) img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude']) def get_doc_generic(docid, bracket=1, verbose=0): import pickle import os.path from googleapiclient.discovery import build from google_auth_oauthlib.flow import InstalledAppFlow from google.auth.transport.requests import Request global img_count, img_lookup, img_heights, img_widths # If modifying these scopes, delete the file token.pickle. SCOPES = ['https://www.googleapis.com/auth/documents.readonly'] creds = None # The file token.pickle stores the user's access and refresh tokens, and is # created automatically when the authorization flow completes for the first # time. if os.path.exists('token.pickle'): with open('token.pickle', 'rb') as token: creds = pickle.load(token) if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: flow = InstalledAppFlow.from_client_secrets_file( 'credentials.json', SCOPES) creds = flow.run_local_server(port=0) # Save the credentials for the next run with open('token.pickle', 'wb') as token: pickle.dump(creds, token) service = build('docs', 'v1', credentials=creds) # Retrieve the documents contents from the Docs service. 
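    # documents().get() returns the whole document as one dict; the pieces used below
    # are 'title', 'body' -> 'content' (the ordered Structural Elements), 'inlineObjects'
    # (embedded images, downloaded by fetch_doc_image), and 'lists' (bullet/list metadata
    # referenced by each paragraph's listId).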
document = service.documents().get(documentId=docid).execute() tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8') tempout.write( json.dumps(document,indent=2) \ + "\n\n\n------------------------------------\n\n") if verbose: print('The title of the document is: {}'.format(document.get('title'))) doc_content = document.get('body').get('content') doc_objects = document.get('inlineObjects') doc_lists = document.get('lists') #text = '' result = [] last_type = '' #answer_text = '' answer = [] in_a_list = '' # Get all the images for k,value in doc_objects.items(): tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n") fetched = fetch_doc_image(k,value) list_stack = [] list_depth = 0 last_list_depth = 0 for value in doc_content: tempout.write( json.dumps(value,indent=2) + "\n\n\n") if verbose: print(json.dumps(value, sort_keys=True, indent=4)) tag_fxn = handle_para if 'paragraph' in value: this_text = '' # First we deal with if we're in a list. if 'bullet' in value['paragraph']: # either we're (1)starting a new list, (2)in one (do nothing), # (3)starting a nested one, or (4)finished a nested one. lid = value['paragraph']['bullet']['listId'] if not list_stack: # 1 list_stack.append(lid) else: if not lid == list_stack[0]: if not lid in list_stack: # 3 list_stack.append(lid) else: # 4 x = list_stack.pop() while x != lid: list_stack.pop() elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open. list_stack = [] list_depth = len(list_stack) deeper = list_depth - last_list_depth if deeper > 0: answer.append("" * deeper) if len(list_stack): tag_fxn = handle_li # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next, elements = value.get('paragraph').get('elements') if 'paragraphStyle' in value.get('paragraph'): style = value.get('paragraph').get('paragraphStyle') if 'namedStyleType' in style: type = style['namedStyleType'] # and FINALLY, the actual contents. for elem in elements: # text content this_text += read_paragraph_element_2(elem,type) # image content if 'inlineObjectElement' in elem: vpi = elem['inlineObjectElement'] if 'inlineObjectId' in vpi: ii = vpi['inlineObjectId'] if ii in img_lookup: img = img_lookup[ii] h = img_heights[ii] w = img_widths[ii] this_text += '' % (img,w,h) # Now for something tricky. Call an appropriate handler, based on: # (a) what is the paragraph style type? # (b) is it different from the prev one? 
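                # Each handler returns a plain tuple that ends up in the JSON result, e.g.:
                #   handle_sec('Grades')           -> ('section', 'Grades')
                #   handle_question('How? [faq1]') -> ('question', 'How? ', 'faq1')
                #   handle_answer([...])           -> ('answer', [...])
                # Consecutive NORMAL_TEXT paragraphs accumulate in `answer` and are
                # flushed as a single ('answer', ...) tuple when the style changes.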
if last_type=='NORMAL_TEXT' and type!=last_type: if this_text.strip(): result.append(handle_answer(answer)) answer = [] #answer_text = '' if type=='HEADING_2' and this_text.strip(): result.append( handle_sec(this_text) ) this_text = '' elif type=='HEADING_3' and this_text.strip(): result.append(handle_question(this_text,bracket)) this_text = '' else: if this_text.lower().startswith('tags:'): tag_fxn = handle_tags if this_text.lower().startswith('icons:'): tag_fxn = handle_icons if this_text.strip(): answer.append(tag_fxn(this_text)) this_text = '' last_type = type last_list_depth = list_depth elif 'table' in value: pass result.append(handle_answer(answer)) return json.dumps(result,indent=4) def process_reg_history(): from collections import defaultdict from itertools import groupby from operator import itemgetter def read_grouped_csv(path): with open(path, newline='') as f: fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] reader = csv.DictReader(f, fieldnames=fieldnames) rows = sorted(reader, key=lambda r: r['datetime']) # Group by timestamp grouped = {} for ts, group in groupby(rows, key=itemgetter('datetime')): grouped[ts] = {r['crn']: r for r in group} return grouped def crossed_threshold(old_val, new_val, max_val): thresholds = [0.25, 0.5, 0.75, 1.0] if int(max_val) == 0: return False, None old_ratio = int(old_val) / int(max_val) new_ratio = int(new_val) / int(max_val) for t in thresholds: if old_ratio < t <= new_ratio: return True, int(t * 100) return False, None def detect_changes(prev, curr): changes = defaultdict(list) all_crns = prev.keys() | curr.keys() for crn in all_crns: o, n = prev.get(crn), curr.get(crn) if not o: changes[crn].append((n['datetime'], "Section was added.")) elif not n: changes[crn].append(( o['datetime'], f"Section was removed (last seen: teacher {o['teacher']}, " f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)." 
)) else: dt = n['datetime'] if o['teacher'] != n['teacher']: changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}.")) if o['enrolled'] != n['enrolled']: crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max']) if crossed: changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']}).")) if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']: changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}.")) return changes def process_diff_timeline(path): snapshots = read_grouped_csv(path) timeline = sorted(snapshots.keys()) timeline_diffs = [] course_names = {} # crn -> latest known course name for i in range(1, len(timeline)): prev_ts, curr_ts = timeline[i-1], timeline[i] prev, curr = snapshots[prev_ts], snapshots[curr_ts] # update course name map for crn, row in curr.items(): course_names[crn] = row['course'] delta = detect_changes(prev, curr) timeline_diffs.append(delta) # Flatten and group by crn crn_changes = defaultdict(list) for delta in timeline_diffs: for crn, changes in delta.items(): crn_changes[crn].extend(changes) # Sort changes for each CRN by datetime for crn in crn_changes: crn_changes[crn].sort(key=lambda x: x[0]) return crn_changes, course_names output1 = codecs.open('cache/reg_timeline_fa25.txt','w','utf-8') changes, course_names = process_diff_timeline("cache/reg_history_fa25.csv") for crn in sorted(changes, key=lambda c: course_names.get(c, "")): course = course_names.get(crn, "") print(f"\n{course} (CRN {crn}):") output1.write(f"\n{course} (CRN {crn}):\n") for dt, msg in changes[crn]: print(f" [{dt}] {msg}") output1.write(f" [{dt}] {msg}\n") def recreate_reg_data(): from collections import defaultdict from datetime import datetime def parse_row(row): dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M") crn = row['crn'] enrolled = int(row['enrolled']) return dt, row['datetime'], crn, enrolled def reduce_latest_per_day(rows): latest = defaultdict(dict) # latest[crn][date] = (dt, ts, enrolled) latest_ts_by_date = {} # date → (dt, ts) for header naming for row in rows: dt, full_ts, crn, enrolled = parse_row(row) date_str = dt.date().isoformat() ts_header = dt.strftime("%Y-%m-%dT%H") # <-- this is what we want # for each crn, per day, keep latest reading if date_str not in latest[crn] or dt > latest[crn][date_str][0]: latest[crn][date_str] = (dt, ts_header, enrolled) # also record latest timestamp per day for consistent column headers if date_str not in latest_ts_by_date or dt > latest_ts_by_date[date_str][0]: latest_ts_by_date[date_str] = (dt, ts_header) return latest, [ts for _, ts in sorted(latest_ts_by_date.values())] def pivot_table(latest, headers): crns = sorted(latest) table = [] for crn in crns: row = [crn] for ts in headers: date_str = ts[:10] # match on YYYY-MM-DD val = latest[crn].get(date_str) if val and val[1] == ts: row.append(str(val[2])) else: row.append("") table.append(row) return ['crn'] + headers, table with open("cache/reg_history_fa25.csv", newline='') as f: fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted'] reader = csv.DictReader(f, fieldnames=fieldnames) rows = list(reader) latest, headers = reduce_latest_per_day(rows) header_row, table = pivot_table(latest, headers) with open("cache/reg_data_fa25.csv", "w", newline='') as f: writer = csv.writer(f) writer.writerow(header_row) writer.writerows(table) if __name__ == "__main__": print ('') options = { 1: ['Fetch rosters on 
schedule',fetch_current_rosters_auto] , 2: ['Get canvas data 2024 style', canvas_data_2024_run ], 3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run], 4: ['Narrative timeline of section updates', process_reg_history], 5: ['Recreate reg_data from full reg history', recreate_reg_data], } '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] , 2: ['Fetch rosters',fetch_current_rosters] , 3: 4: ['Compute how registration is filling up classes', schedule_filling] , 5: ['Manually convert 3 csv files to joined json enrollment file.', convert_roster_files] , 6: ['Canvas data: interactive sync', interactive ], 7: ['Canvas data: automated sync', sync_non_interactive ], 8: 9: 16: ['Scrape schedule from ssb', scrape_schedule_multi ], 14: ['Generate latestart schedule', list_latestarts ], 15: ['Test ssb calls with python', scrape_schedule_py ], 10: ['schedule to db', scrape_for_db ], 11: ['clean argos draft schedule file', argos_data_from_cvc], 12: ['make expanded schedule json files of old semesters', expand_old_semesters ], 13: ['Parse deanza schedule', dza_sched ], ''' if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): resp = int(sys.argv[1]) print("\n\nPerforming: %s\n\n" % options[resp][0]) else: print ('') for key in options: print(str(key) + '.\t' + options[key][0]) print('') resp = input('Choose: ') # Call the function in the options dict options[ int(resp)][1]() # Testing #if __name__ == "__main__": #users = fetch('/api/v1/courses/69/users?per_page=100',1) #print "These are the users: " #print users #getSemesterSchedule() #get_doc() #pass
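# A few more commented-out examples in the same spirit (the account terms endpoint is an
# illustrative guess; course 69 mirrors the example above):
#terms = fetch_collapse('/api/v1/accounts/self/terms', 'enrollment_terms', 1)
#for page in fetch_stream('/api/v1/courses/69/users?per_page=100'):
#    print("got a page with", len(page), "users")
#canvas_data_2024_run()   # needs DAP_API_URL, DAP_CLIENT_ID and DAP_CLIENT_SECRET set in the environment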