cleanup
commit 9584f45f30 (parent ff5ed654eb)

content.py (329 lines changed)

@@ -1528,6 +1528,334 @@ LANE: HyFlex

################
################ GOOGLE DOCS HELPERS (moved from pipelines)
################
def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"

def question(t, bracket=1):
    ret = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        # An explicit [anchor] tag names the anchor and is stripped from the title.
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub(r'\[.*\]', '', t)
    else:
        # No bracketed tag: build the anchor from the first letter of each word.
        parts = t.split(' ')
        id = ''
        for p in parts:
            if p and re.search(r'[a-zA-Z]', p[0]): id += p[0]
        ret += "<a name='%s'></a>" % id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'

def answer(t):
    return t + '</div></div>\n'
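A quick sketch of how these helpers compose; the heading strings here are made-up examples, not content from the repo:

# "[hyflex]" supplies an explicit anchor; otherwise word initials are used.
html = question("What is HyFlex? [hyflex]") + answer(para("A mixed-mode course."))
# question("Getting Started Guide") would emit <a name='gsg'></a> instead.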
def read_paragraph_element(element, type="NORMAL_TEXT"):
    # Render one Docs API ParagraphElement as HTML (links and bold only).
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run:
        return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold'] == True and type == "NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'
    content = text_run.get('content')
    content = re.sub(u'\u000b', '<br />\n', content)  # vertical tab -> explicit line break
    return begin + content + end


def read_paragraph_element_2(element, type="NORMAL_TEXT"):
    return read_paragraph_element(element, type)
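For reference, a minimal sketch of the textRun structure this consumes; the field names follow the Docs API, but the values are invented:

elem = {'textRun': {'content': 'course syllabus',
                    'textStyle': {'bold': True}}}
read_paragraph_element(elem)   # -> '<strong>course syllabus</strong>'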
# t is a string that begins with "Icons: " and contains a comma(space) separated list
def handle_icons(t):
    text = t[7:].strip()   # strip the "Icons: " prefix
    parts = text.split(", ")
    return ('icons', parts)

# t is a string that begins with "Tags: " and contains a comma(space) separated list
def handle_tags(t):
    text = t[6:].strip()   # strip the "Tags: " prefix
    parts = text.split(", ")
    return ('tags', parts)

def handle_question(t, bracket=1):
    anchor = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        for p in parts:
            if p and re.search(r'[a-zA-Z]', p[0]): anchor += p[0].lower()
    return ('question', t, anchor)

def handle_answer(t):
    return ('answer', t)

def handle_sec(t): return ('section', t)
def handle_para(t): return ('paragraph', t)
def handle_ul(t): return ('unorderdedlist', t)
def handle_li(t): return ('listitem', t)
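These handlers all return tagged tuples, so presumably a caller routes raw lines by prefix; a hypothetical dispatcher, not part of the commit:

def dispatch(line):
    # Route a line to the first handler whose prefix matches.
    for prefix, fxn in (('Icons: ', handle_icons), ('Tags: ', handle_tags)):
        if line.startswith(prefix):
            return fxn(line)
    return handle_para(line)

dispatch('Tags: online, hybrid')   # -> ('tags', ['online', 'hybrid'])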
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}


'''def fetch_doc_image(k,value):
    global img_count, img_lookup, img_heights, img_widths
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    print(k)
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                    img_count += 1
                    img_lookup[k] = name

                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    print(uu)
                    print(response.headers)
                    print(name)
                    del response
            if 'size' in value['inlineObjectProperties']['embeddedObject']:
                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
'''
def fetch_doc_image(k, value):
    # Download one inline image from a Doc's inlineObjects entry into the local cache.
    import shutil
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
                    with open('cache/doc_images/' + name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
    return True
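Presumably this is driven by walking a fetched document's inlineObjects map; a sketch, assuming `document` came back from the Docs API as in get_doc below:

for k, value in (document.get('inlineObjects') or {}).items():
    fetch_doc_image(k, value)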
def get_doc(docid, bracket=1, verbose=0):
    import pickle, shutil
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request

    #ooout = open(fileout,'w')

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    service = build('docs', 'v1', credentials=creds)

    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)

    tempout = codecs.open('cache/trash/gdoctemp.txt', 'w', 'utf-8')
    tempout.write(json.dumps(document, indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)

    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)

    doc_lists = document.get('lists')

    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    in_a_list = ''

    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}

    # Download every inline image up front so the content pass can reference it.
    if doc_objects:
        for k, value in doc_objects.items():
            tempout.write("->" + k + "=" + json.dumps(value, indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1

                            img_lookup[k] = name

                            with open('cache/doc_images/' + name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])

    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")

    tempout.write('- - - - - - - -\n\n')
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    for value in doc_content:
        tempout.write(json.dumps(value, indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''

            if 'bullet' in value['paragraph']:
                # Either we're (1) starting a new list, (2) in one, (3) starting
                # a nested one, or (4) finished a nested one.
                lid = value['paragraph']['bullet']['listId']

                if not list_stack:                   # 1
                    list_stack.append(lid)
                elif lid == list_stack[-1]:          # 2
                    pass
                elif not lid in list_stack:          # 3
                    list_stack.append(lid)
                else:                                # 4: pop until lid is back on top
                    while list_stack and list_stack[-1] != lid:
                        list_stack.pop()
            elif len(list_stack) > 0:  # current para isn't a bullet but we still have a list open.
                list_stack = []

            list_depth = len(list_stack)

            deeper = list_depth - last_list_depth

            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                answer_text += "</ul>" * -deeper

            if len(list_stack):
                tag_fxn = li

            elements = value.get('paragraph').get('elements')

            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",

            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            for elem in elements:

                # text content
                this_text += read_paragraph_element(elem, type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img, w, h)

            # A heading after normal text closes the open answer block.
            if last_type == 'NORMAL_TEXT' and type != last_type:
                text += answer(answer_text)
                answer_text = ''

            if type == 'HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type == 'HEADING_3':
                text += question(this_text, bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            # The text in table cells is in nested Structural Elements, and tables may be nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_structural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_structural_elements(toc.get('content'))

        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))

    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text
def get_doc_generic(docid, bracket=1, verbose=0):
    return get_doc(docid, bracket, verbose)
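A usage sketch; the document id and output path are placeholders, and credentials.json / token.pickle must exist as described above:

html = get_doc('1aBcD-placeholder-doc-id', bracket=1)   # accordion-group HTML as one string
with open('cache/faq.html', 'w') as f:
    f.write(html)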
@@ -1567,4 +1895,3 @@ if __name__ == "__main__":

    # Call the function in the options dict
    options[ int(resp)][1]()
courses.py (19 lines changed)

@@ -2690,24 +2690,7 @@ def enrollment_helper():

# fill percentage for each section, then by mode, tod, campus

def try_clustering(df):
    # Import required libraries
    from sklearn.cluster import KMeans

    # Preprocessing

    # Assuming df is your DataFrame and "modes" is your categorical column
    #df['code'] = df['code'].astype('category').cat.codes

    # Removing any other unnecessary columns
    df = df.drop(['code'], axis=1)

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)

    # Get the cluster labels
    labels = kmeans.labels_

## moved: try_clustering now in search.py
    # Add labels to the DataFrame
    #df['clusters'] = labels
    #print(df)
@@ -4,6 +4,98 @@

# from pipelines - canvas data
# Canvas data, download all new files
def sync_non_interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)

        print(str(i) + '.\tLocal: %s\tRemote: %s' % (exi, filename))
        i += 1
    print("I will attempt to download %i files." % len(wanted))

    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return

    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if response.status_code != 200:
            print('Request response went bad. Got back a %s code, meaning the request was %s' %
                  (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1
        else:
            # Use the downloaded data
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted), good_count, bad_count))
## OLD STYLE CANVAS DATA

# Get something from Canvas Data
def do_request(path):
    # Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'

    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)

    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(), 'utf-8')

    sig = base64.b64encode(hmac.new(key=bytes(apiSecret, 'utf-8'), msg=msgStr, digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')

    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey, sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType

    # Submit the request / get a response
    uri = "https://" + host + path
    print(uri)
    print(headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)

    # Check to make sure the request was ok
    if response.status_code != 200:
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
    else:
        # Use the downloaded data
        jsonData = response.json()
        #print(json.dumps(jsonData, indent=4))
        return jsonData
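The signature covers a newline-joined message in a fixed field order. The signing step in isolation, as a standalone sketch of what do_request builds:

def sign_canvas_data(method, host, content_type, path, api_time, api_secret):
    # Field order matters: method, host, content type, (blank), path,
    # (blank query string), date, then the secret itself.
    msg = "\n".join([method, host, content_type, '', path, '', api_time, api_secret])
    digest = hmac.new(bytes(api_secret, 'utf-8'),
                      msg=bytes(msg, 'utf-8'),
                      digestmod=hashlib.sha256).digest()
    return base64.b64encode(digest).decode('utf-8')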
def file_doesnt_exist(name):
    # Get list of files in current directory
    files = os.listdir()
@@ -8,7 +8,7 @@ from datetime import datetime as dt
from datetime import timedelta
from dateutil.parser import parse
from os.path import exists, getmtime
from pipelines import sync_non_interactive, url, header
from pipelines import url, header
import util
from semesters import short_to_sis

@@ -1121,7 +1121,7 @@ def full_reload():
    except Exception as e:
        print("Couldn't rename file:", str(e))

    sync_non_interactive()
    #sync_non_interactive()

    setup_table('requests_sum1')
    setup_table('courses')
pipelines.py (511 lines changed)

@@ -3,7 +3,7 @@ import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
from datetime import timedelta

from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media
from canvas_secrets import instructure_url, instructure_username, instructure_private_key

import os, asyncio
@@ -571,95 +571,6 @@ def fetch_current_rosters_auto(poll_seconds=15):
        time.sleep(max(5, int(poll_seconds)))


# Canvas data, download all new files
def sync_non_interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)

        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
        i += 1
    print("I will attempt to download %i files." % len(wanted))

    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return

    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if(response.status_code != 200):
            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
                (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1
        else:
            #Use the downloaded data
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))


## OLD STYLE CANVAS DATA

# Get something from Canvas Data
def do_request(path):
    #Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'

    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)

    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')

    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')

    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType


    #Submit the request/get a response
    uri = "https://"+host+path
    print (uri)
    print (headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)

    #Check to make sure the request was ok
    if(response.status_code != 200):
        print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
    else:
        #Use the downloaded data
        jsonData = response.json()
        #print(json.dumps(jsonData, indent=4))
        return jsonData
@@ -693,425 +604,7 @@ def put_file(remotepath,localpath, localfile,prompt=1):
    sftp.close()


"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
    print("This local file: " + f + " ", end=' ')
    if not f in files:
        sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
        print("Uploaded.")
    else:
        print("Skipped.")
"""

"""if len(files)==3 and 'users.csv' in files:
    sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
    sftp.get('users.csv','rosters/users-'+folder+'.csv')
    sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
    print folder + '\tSaved three data files in rosters folder.'

    courses = open('rosters/courses-'+folder+'.csv','r')
    courses.readline()
    a = courses.readline()
    print a
    courses.close()
    parts = a.split(',')
    year = parts[1][0:4]
    ss = parts[1][4:6]
    #print parts[1]
    sem = {'30':'spring', '50':'summer', '70':'fall' }
    this_sem = sem[ss]
    #print this_sem, "", year
    print folder + '\tbuilding data file...'
    convert_roster_files(this_sem,year,folder)
    print folder + '\tmoving files...'
    move_to_folder(this_sem,year,folder)
else:
    print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################

def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"

def question(t,bracket=1):
    ret = ''
    match = re.search( r'\[(.*)\]', t)
    if match and bracket:
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub( r'\[.*\]','',t)
    else:
        parts = t.split(' ')
        id = ''
        for p in parts:
            if re.search(r'[a-zA-Z]',p[0]): id += p[0]
        ret += "<a name='%s'></a>" % id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'

def answer(t):
    return t + '</div></div>\n'
def read_paragraph_element(element,type="NORMAL_TEXT"):
    """Returns the text in the given ParagraphElement.

    Args:
        element: a ParagraphElement from a Google Doc.
    """
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run:
        return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'

    content = text_run.get('content')
    content = re.sub(u'\u000b','<br />\n',content)

    return begin + content + end
def get_doc(docid, bracket=1, verbose=0):
    import pickle
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request

    #ooout = open(fileout,'w')

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)

    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)

    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)

    doc_lists = document.get('lists')

    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    in_a_list = ''

    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}

    if doc_objects:
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1

                            img_lookup[k] = name

                            with open('cache/doc_images/'+name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])

    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")

    tempout.write('- - - - - - - -\n\n')
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''

            if 'bullet' in value['paragraph']:
                # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.

                lid = value['paragraph']['bullet']['listId']

                if not list_stack: # 1
                    list_stack.append(lid)
                else:
                    if lid == list_stack[0]: # 2
                        pass
                    else:
                        if not lid in list_stack: # 3
                            list_stack.append(lid)
                        else: # 4
                            x = list_stack.pop()
                            while x != lid: list_stack.pop()
            elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
                list_stack = []

            list_depth = len(list_stack)

            deeper = list_depth - last_list_depth

            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                deeper = -1 * deeper
                answer_text += "</ul>" * deeper

            if len(list_stack):
                tag_fxn = li

            elements = value.get('paragraph').get('elements')

            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",

            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            for elem in elements:

                # text content
                this_text += read_paragraph_element(elem,type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)

            if last_type=='NORMAL_TEXT' and type!=last_type:
                text += answer(answer_text)
                answer_text = ''

            if type=='HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type=='HEADING_3':
                text += question(this_text,bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            # The text in table cells is in nested Structural Elements, and tables may be nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_structural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_structural_elements(toc.get('content'))

        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))

    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text
######### TRY #2 ######


def read_paragraph_element_2(element,type="NORMAL_TEXT"):
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run: return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'
    elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
        begin = '<em>' + begin
        end = end + '</em>'
    content = text_run.get('content')
    content = re.sub(u'\u000b','<br />\n',content)
    return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
    text = t[7:].strip()
    parts = text.split(", ")
    return ('icons',parts)

# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
    text = t[6:].strip()
    parts = text.split(", ")
    return ('tags',parts)

def handle_question(t,bracket=1):
    anchor = ''
    match = re.search( r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub( r'\[.*\]','',t)
    else:
        parts = t.split(' ')
        for p in parts:
            if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
    return ('question', t, anchor)

def handle_answer(t):
    return ('answer',t)

def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}


def fetch_doc_image(k,value):
    global img_count, img_lookup, img_heights, img_widths
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    print(k)
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                    img_count += 1
                    img_lookup[k] = name

                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    print(uu)
                    print(response.headers)
                    print(name)
                    del response
            if 'size' in value['inlineObjectProperties']['embeddedObject']:
                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
    import pickle
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    global img_count, img_lookup, img_heights, img_widths

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()

    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
    tempout.write( json.dumps(document,indent=2) \
        + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))

    doc_content = document.get('body').get('content')
    doc_objects = document.get('inlineObjects')
    doc_lists = document.get('lists')

    #text = ''
    #text =
    result = []
    last_type = ''
    #answer_text = ''
search.py (23 lines changed)

@@ -554,3 +554,26 @@ if __name__ == "__main__":

    # Call the function in the options dict
    options[ int(resp)][1]()

def try_clustering(df):
    from sklearn.cluster import KMeans
    # Drop the non-numeric course code column, then cluster the remaining features.
    df = df.drop(['code'], axis=1)
    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
    return kmeans.labels_
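A usage sketch; the column names are hypothetical stand-ins for the numeric section features this is meant to receive:

import pandas as pd

df = pd.DataFrame({'code': ['A1', 'B2', 'C3', 'D4'],      # dropped inside
                   'fill_pct': [0.9, 0.2, 0.85, 0.1],
                   'online':   [1, 0, 1, 0]})
labels = try_clustering(df)   # one of four cluster ids per row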
def nlp_sample():
    from gensim import utils, corpora
    from nltk import stem
    stemmer = stem.porter.PorterStemmer()
    strings = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
    dictionary = corpora.Dictionary(processed)
    return dictionary
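A plausible next step with the returned Dictionary (not part of the commit) is mapping new text onto the learned vocabulary as (token_id, count) pairs:

from gensim import utils
from nltk import stem

stemmer = stem.porter.PorterStemmer()
dictionary = nlp_sample()
# Preprocess and stem the query the same way the corpus was prepared.
tokens = [stemmer.stem(t) for t in utils.simple_preprocess("Human computer interface", min_len=4)]
print(dictionary.doc2bow(tokens))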
users.py (8 lines changed)

@@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
    print(json.dumps(g2, indent=2))


def nlp_sample():
## moved: nlp_sample now in search.py
# def nlp_sample():
    # Stream a training corpus directly from S3.
    #corpus = corpora.MmCorpus("s3://path/to/corpus")

@@ -1955,9 +1956,7 @@ def nlp_sample():
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
    print(processed)
    dictionary = corpora.Dictionary( processed )
    # moved
    dct = dictionary
    print(dictionary)

@@ -2980,4 +2979,3 @@ if __name__ == "__main__":

    # Call the function in the options dict
    options[ int(resp)][1]()