cleanup

parent ff5ed654eb
commit 9584f45f30

329 content.py

@@ -1528,6 +1528,334 @@ LANE: HyFlex
+################
+################ GOOGLE DOCS HELPERS (moved from pipelines)
+################
+
+def sec(t): return "<h3>"+t+"</h3>\n"
+def para(t): return "<p>"+t+"</p>\n"
+def ul(t): return "<ul>"+t+"</ul>\n"
+def li(t): return "<li>"+t+"</li>\n"
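These four helpers wrap text in a tag and return strings, so they compose by plain concatenation. A minimal usage sketch (illustrative values, not part of the commit):

    html = sec("FAQ") + ul(li("one") + li("two"))
    # -> '<h3>FAQ</h3>\n<ul><li>one</li>\n<li>two</li>\n</ul>\n'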
+
+def question(t,bracket=1):
+    ret = ''
+    match = re.search( r'\[(.*)\]', t)
+    if match and bracket:
+        ret += "<a name='" + match.group(1) + "'></a>"
+        t = re.sub( r'\[.*\]','',t)
+    else:
+        parts = t.split(' ')
+        id = ''
+        for p in parts:
+            if re.search(r'[a-zA-Z]',p[0]): id += p[0]
+        ret += "<a name='%s'></a>" % id.lower()
+    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
+
+def answer(t):
+    return t + '</div></div>\n'
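question() opens an accordion block and derives an anchor name either from a [bracketed] id or, failing that, from the first letter of each word; answer() closes the block. Sketch with made-up headings:

    question("What is HyFlex? [hyflex]")  # anchor 'hyflex'; the brackets are stripped from the heading
    question("Where do I park")           # no brackets: word initials give anchor 'wdip'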
+
+def read_paragraph_element(element,type="NORMAL_TEXT"):
+    text_run = element.get('textRun')
+    begin = ''
+    end = ''
+    if not text_run:
+        return ''
+    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
+        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
+        end = '</a>'
+    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
+        begin = '<strong>' + begin
+        end = end + '</strong>'
+    content = text_run.get('content')
+    content = re.sub(u'\u000b','<br />\n',content)
+    return begin + content + end
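The element argument is a Docs API ParagraphElement; only its textRun is used, links and bold runs become tags, and vertical tabs (\u000b) become <br />. Sketch with an illustrative element:

    elem = {'textRun': {'content': 'hello', 'textStyle': {'bold': True}}}
    read_paragraph_element(elem)  # -> '<strong>hello</strong>'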
+
+def read_paragraph_element_2(element,type="NORMAL_TEXT"):
+    return read_paragraph_element(element,type)
+
+# t is a string that begins with "Icons: " and contains a comma-space separated list
+def handle_icons(t):
+    text = t[7:].strip()
+    parts = text.split(", ")
+    return ('icons',parts)
+
+# t is a string that begins with "Tags: " and contains a comma-space separated list
+def handle_tags(t):
+    text = t[6:].strip()
+    parts = text.split(", ")
+    return ('tags',parts)
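The slice offsets are the lengths of the literal prefixes: len("Icons: ") == 7 and len("Tags: ") == 6. Worked examples (illustrative input):

    handle_icons("Icons: star, clock, map")  # -> ('icons', ['star', 'clock', 'map'])
    handle_tags("Tags: online, evening")     # -> ('tags', ['online', 'evening'])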
+
+def handle_question(t,bracket=1):
+    anchor = ''
+    match = re.search( r'\[(.*)\]', t)
+    if match and bracket:
+        anchor = match.group(1).lower()
+        t = re.sub( r'\[.*\]','',t)
+    else:
+        parts = t.split(' ')
+        for p in parts:
+            if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
+    return ('question', t, anchor)
+
+def handle_answer(t):
+    return ('answer',t)
+
+def handle_sec(t): return ('section',t)
+def handle_para(t): return ('paragraph',t)
+def handle_ul(t): return ('unorderdedlist',t)
+def handle_li(t): return ('listitem',t)
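Unlike question()/answer() above, the handle_* family returns (kind, payload) tuples and leaves HTML rendering to the caller. Sketch (illustrative input; note the trailing space left where the bracket was removed):

    handle_question("Can I enroll late? [late]")  # -> ('question', 'Can I enroll late? ', 'late')
    handle_sec("Registration")                    # -> ('section', 'Registration')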
+
+img_count = 1
+img_lookup = {}
+img_heights = {}
+img_widths = {}
+
+'''def fetch_doc_image(k,value):
+    global img_count, img_lookup, img_heights, img_widths
+    if 'inlineObjectProperties' in value:
+        if 'embeddedObject' in value['inlineObjectProperties']:
+            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
+                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
+                    print(k)
+                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
+                    response = requests.get(uu, stream=True)
+                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
+                    img_count += 1
+                    img_lookup[k] = name
+
+                    with open('cache/doc_images/'+name, 'wb') as out_file:
+                        shutil.copyfileobj(response.raw, out_file)
+                    print(uu)
+                    print(response.headers)
+                    print(name)
+                    del response
+            if 'size' in value['inlineObjectProperties']['embeddedObject']:
+                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
+                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
+'''
+
+def fetch_doc_image(k,value):
+    import shutil
+    if 'inlineObjectProperties' in value:
+        if 'embeddedObject' in value['inlineObjectProperties']:
+            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
+                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
+                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
+                    response = requests.get(uu, stream=True)
+                    name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
+                    with open('cache/doc_images/'+name, 'wb') as out_file:
+                        shutil.copyfileobj(response.raw, out_file)
+                    del response
+    return True
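This rewrite keys the cached filename on the inlineObject id k rather than a running counter, so re-fetching a document overwrites instead of accumulating copies; the extension comes from the response Content-Type. Hedged sketch with a made-up id and object:

    obj = doc['inlineObjects']['kix.abc123']   # 'doc' and the id are hypothetical
    fetch_doc_image('kix.abc123', obj)         # would write cache/doc_images/image_kix.abc123.png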
+
+def get_doc(docid, bracket=1, verbose=0):
+    import pickle, shutil
+    import os.path
+    from googleapiclient.discovery import build
+    from google_auth_oauthlib.flow import InstalledAppFlow
+    from google.auth.transport.requests import Request
+
+    #ooout = open(fileout,'w')
+
+    # If modifying these scopes, delete the file token.pickle.
+    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
+    creds = None
+    # The file token.pickle stores the user's access and refresh tokens, and is
+    # created automatically when the authorization flow completes for the first
+    # time.
+    if os.path.exists('token.pickle'):
+        with open('token.pickle', 'rb') as token:
+            creds = pickle.load(token)
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            creds.refresh(Request())
+        else:
+            flow = InstalledAppFlow.from_client_secrets_file(
+                'credentials.json', SCOPES)
+            creds = flow.run_local_server(port=0)
+        # Save the credentials for the next run
+        with open('token.pickle', 'wb') as token:
+            pickle.dump(creds, token)
+
+    service = build('docs', 'v1', credentials=creds)
+
+    # Retrieve the document's contents from the Docs service.
+    document = service.documents().get(documentId=docid).execute()
+    if verbose: print(document)
+
+    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
+    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
+    if verbose: print('The title of the document is: {}'.format(document.get('title')))
+    doc_content = document.get('body').get('content')
+    if verbose: print(doc_content)
+
+    doc_objects = document.get('inlineObjects')
+    if verbose: print(doc_objects)
+
+    doc_lists = document.get('lists')
+
+    text = '<div class="acrd_grp" data-accordion-group="">'
+    last_type = ''
+    answer_text = ''
+    in_a_list = ''
+
+    img_count = 1
+    img_lookup = {}
+    img_heights = {}
+    img_widths = {}
+
+    if doc_objects:
+        for k,value in doc_objects.items():
+            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
+            if 'inlineObjectProperties' in value:
+                if 'embeddedObject' in value['inlineObjectProperties']:
+                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
+                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
+                            print(k)
+                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
+                            response = requests.get(uu, stream=True)
+                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
+                            img_count += 1
+
+                            img_lookup[k] = name
+
+                            with open('cache/doc_images/'+name, 'wb') as out_file:
+                                shutil.copyfileobj(response.raw, out_file)
+                            print(uu)
+                            print(response.headers)
+                            print(name)
+                            #input('x?')
+                            del response
+                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
+                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
+                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
+
+    tempout.write('- - - - - - - -\n\n')
+    #for value in doc_lists:
+    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
+
+    tempout.write('- - - - - - - -\n\n')
+    list_stack = []
+    list_depth = 0
+    last_list_depth = 0
+    for value in doc_content:
+        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
+        if verbose: print(json.dumps(value, sort_keys=True, indent=4))
+
+        # todo: x link, x bold, list, image.
+        tag_fxn = para
+        if 'paragraph' in value:
+            this_text = ''
+
+            if 'bullet' in value['paragraph']:
+                # either we're (1) starting a new list, (2) in one, (3) starting a nested one, or (4) finished a nested one
+                lid = value['paragraph']['bullet']['listId']
+
+                if not list_stack:  # (1)
+                    list_stack.append(lid)
+                else:
+                    if lid == list_stack[0]:  # (2)
+                        pass
+                    else:
+                        if not lid in list_stack:  # (3)
+                            list_stack.append(lid)
+                        else:  # (4) pop back down to the list we returned to
+                            x = list_stack.pop()
+                            while x != lid: x = list_stack.pop()
+            elif len(list_stack) > 0:  # current para isn't a bullet but we still have a list open
+                list_stack = []
+
+            list_depth = len(list_stack)
+
+            deeper = list_depth - last_list_depth
+
+            if deeper > 0:
+                answer_text += "<ul>" * deeper
+            elif deeper < 0:
+                deeper = -1 * deeper
+                answer_text += "</ul>" * deeper
+
+            if len(list_stack):
+                tag_fxn = li
+
+            elements = value.get('paragraph').get('elements')
+
+            # inlineObjectElement": {
+            #     "inlineObjectId": "kix.ssseeu8j9cfx",
+
+            if 'paragraphStyle' in value.get('paragraph'):
+                style = value.get('paragraph').get('paragraphStyle')
+                #text += json.dumps(style, sort_keys=True, indent=4)
+                if 'namedStyleType' in style:
+                    type = style['namedStyleType']
+
+            for elem in elements:
+
+                # text content
+                this_text += read_paragraph_element(elem,type)
+
+                # image content
+                if 'inlineObjectElement' in elem:
+                    vpi = elem['inlineObjectElement']
+                    if 'inlineObjectId' in vpi:
+                        ii = vpi['inlineObjectId']
+                        if ii in img_lookup:
+                            img = img_lookup[ii]
+                            h = img_heights[ii]
+                            w = img_widths[ii]
+                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
+
+            if last_type=='NORMAL_TEXT' and type!=last_type:
+                text += answer(answer_text)
+                answer_text = ''
+
+            if type=='HEADING_2':
+                text += sec(this_text)
+                this_text = ''
+            elif type=='HEADING_3':
+                text += question(this_text,bracket)
+                this_text = ''
+            else:
+                answer_text += tag_fxn(this_text)
+                this_text = ''
+            last_type = type
+            last_list_depth = list_depth
+
+        elif 'table' in value:
+            # The text in table cells is in nested Structural Elements, and tables may be nested.
+            text += "\nTABLE\n"
+            #table = value.get('table')
+            #for row in table.get('tableRows'):
+            #    cells = row.get('tableCells')
+            #    for cell in cells:
+            #        text += read_strucutural_elements(cell.get('content'))
+        #elif 'tableOfContents' in value:
+        #    # The text in the TOC is also in a Structural Element.
+        #    toc = value.get('tableOfContents')
+        #    text += read_strucutural_elements(toc.get('content'))
+
+        #else:
+        #    print(json.dumps(value, sort_keys=True, indent=4))
+
+    text += answer(answer_text)
+    #text += '</div>'
+    #print(text)
+    return text
+
+def get_doc_generic(docid, bracket=1, verbose=0):
+    return get_doc(docid, bracket, verbose)
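End to end: get_doc authorizes against the Docs API via a cached token.pickle, downloads the document and its inline images, then walks body.content, turning HEADING_2 paragraphs into sections, HEADING_3 into accordion questions, and everything else into answer text. A hedged usage sketch; the document id and output path are placeholders:

    html = get_doc('1aBcD...')             # hypothetical Google Doc id
    with open('cache/faq.html', 'w') as f:
        f.write(html)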

@@ -1567,4 +1895,3 @@ if __name__ == "__main__":

     # Call the function in the options dict
     options[ int(resp)][1]()
19 courses.py

@@ -2690,24 +2690,7 @@ def enrollment_helper():
 # fill percentage for each section, then by mode, tod, campus

-def try_clustering(df):
-    # Import required libraries
-    from sklearn.cluster import KMeans
-    # Preprocessing
-    # Assuming df is your DataFrame and "modes" is your categorical column
-    #df['code'] = df['code'].astype('category').cat.codes
-    # Removing any other unnecessary columns
-    df = df.drop(['code'], axis=1)
-    # Perform KMeans clustering
-    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
-    # Get the cluster labels
-    labels = kmeans.labels_
+## moved: try_clustering now in search.py
 # Add labels to the DataFrame
 #df['clusters'] = labels
 #print(df)

@@ -4,6 +4,98 @@

 # from pipelines - canvas data
+
+# Canvas data, download all new files
+def sync_non_interactive():
+    resp = do_request('/api/account/self/file/sync')
+    mylog.write(json.dumps(resp, indent=4))
+    #mylog.close()
+    gotten = os.listdir(local_data_folder)
+    wanted = []
+    i = 0
+    for x in resp['files']:
+        filename = x['filename']
+        exi = "No "
+        if filename in gotten: exi = "Yes"
+        else: wanted.append(x)
+
+        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
+        i += 1
+    print("I will attempt to download %i files." % len(wanted))
+
+    #answer = input("Press enter to begin, or q to quit ")
+    #if not answer == '': return
+
+    good_count = 0
+    bad_count = 0
+    for W in wanted:
+        print("Downloading: " + W['filename'])
+        response = requests.request(method='GET', url=W['url'], stream=True)
+        if(response.status_code != 200):
+            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
+                (response.status_code, response.reason))
+            print('URL: ' + W['url'])
+            bad_count += 1
+
+        else:
+            #Use the downloaded data
+            with open(local_data_folder + W['filename'], 'wb') as fd:
+                for chunk in response.iter_content(chunk_size=128):
+                    fd.write(chunk)
+            print("Success")
+            good_count += 1
+    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
+
+
+## OLD STYLE CANVAS DATA
+
+# Get something from Canvas Data
+def do_request(path):
+    #Set up the request pieces
+    method = 'GET'
+    host = 'api.inshosteddata.com'
+    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
+    apiContentType = 'application/json'
+
+    msgList = []
+    msgList.append(method)
+    msgList.append(host)
+    msgList.append(apiContentType)
+    msgList.append('')
+    msgList.append(path)
+    msgList.append('')
+    msgList.append(apiTime)
+    msgList.append(apiSecret)
+
+    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
+
+    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
+    sig = sig.decode('utf-8')
+
+    headers = {}
+    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
+    headers['Date'] = apiTime
+    headers['Content-type'] = apiContentType
+
+    #Submit the request/get a response
+    uri = "https://"+host+path
+    print (uri)
+    print (headers)
+    response = requests.request(method='GET', url=uri, headers=headers, stream=True)
+
+    #Check to make sure the request was ok
+    if(response.status_code != 200):
+        print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
+    else:
+        #Use the downloaded data
+        jsonData = response.json()
+        #print(json.dumps(jsonData, indent=4))
+        return jsonData
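The Canvas Data API authenticates with HMAC-SHA256 over a newline-joined message: method, host, content type, a blank field, the path, another blank field, the date, and the secret. The signing step in isolation (editorial sketch; the secret is a dummy):

    import hmac, hashlib, base64
    def sign(path, api_time, secret='dummy'):
        msg = "\n".join(['GET', 'api.inshosteddata.com', 'application/json', '',
                         path, '', api_time, secret]).encode('utf-8')
        digest = hmac.new(secret.encode('utf-8'), msg, hashlib.sha256).digest()
        return base64.b64encode(digest).decode('utf-8')  # used as 'HMACAuth {apiKey}:{sig}'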

 def file_doesnt_exist(name):
     # Get list of files in current directory
     files = os.listdir()

@@ -8,7 +8,7 @@ from datetime import datetime as dt
 from datetime import timedelta
 from dateutil.parser import parse
 from os.path import exists, getmtime
-from pipelines import sync_non_interactive, url, header
+from pipelines import url, header
 import util
 from semesters import short_to_sis

@@ -1121,7 +1121,7 @@ def full_reload():
     except Exception as e:
         print("Couldn't rename file:", str(e))

-    sync_non_interactive()
+    #sync_non_interactive()

     setup_table('requests_sum1')
     setup_table('courses')

511 pipelines.py

@@ -3,7 +3,7 @@ import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
 import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
 from datetime import timedelta

-from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
+from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media
 from canvas_secrets import instructure_url, instructure_username, instructure_private_key

 import os, asyncio

@@ -571,95 +571,6 @@ def fetch_current_rosters_auto(poll_seconds=15):
         time.sleep(max(5, int(poll_seconds)))

-# Canvas data, download all new files
-def sync_non_interactive():
-    resp = do_request('/api/account/self/file/sync')
-    mylog.write(json.dumps(resp, indent=4))
-    #mylog.close()
-    gotten = os.listdir(local_data_folder)
-    wanted = []
-    i = 0
-    for x in resp['files']:
-        filename = x['filename']
-        exi = "No "
-        if filename in gotten: exi = "Yes"
-        else: wanted.append(x)
-
-        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
-        i += 1
-    print("I will attempt to download %i files." % len(wanted))
-
-    #answer = input("Press enter to begin, or q to quit ")
-    #if not answer == '': return
-
-    good_count = 0
-    bad_count = 0
-    for W in wanted:
-        print("Downloading: " + W['filename'])
-        response = requests.request(method='GET', url=W['url'], stream=True)
-        if(response.status_code != 200):
-            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
-                (response.status_code, response.reason))
-            print('URL: ' + W['url'])
-            bad_count += 1
-
-        else:
-            #Use the downloaded data
-            with open(local_data_folder + W['filename'], 'wb') as fd:
-                for chunk in response.iter_content(chunk_size=128):
-                    fd.write(chunk)
-            print("Success")
-            good_count += 1
-    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
-
-
-## OLD STYLE CANVAS DATA
-
-# Get something from Canvas Data
-def do_request(path):
-    #Set up the request pieces
-    method = 'GET'
-    host = 'api.inshosteddata.com'
-    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
-    apiContentType = 'application/json'
-
-    msgList = []
-    msgList.append(method)
-    msgList.append(host)
-    msgList.append(apiContentType)
-    msgList.append('')
-    msgList.append(path)
-    msgList.append('')
-    msgList.append(apiTime)
-    msgList.append(apiSecret)
-
-    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
-
-    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
-    sig = sig.decode('utf-8')
-
-    headers = {}
-    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
-    headers['Date'] = apiTime
-    headers['Content-type'] = apiContentType
-
-    #Submit the request/get a response
-    uri = "https://"+host+path
-    print (uri)
-    print (headers)
-    response = requests.request(method='GET', url=uri, headers=headers, stream=True)
-
-    #Check to make sure the request was ok
-    if(response.status_code != 200):
-        print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
-    else:
-        #Use the downloaded data
-        jsonData = response.json()
-        #print(json.dumps(jsonData, indent=4))
-        return jsonData

@@ -693,425 +604,7 @@ def put_file(remotepath,localpath, localfile,prompt=1):
     sftp.close()

-"""
-# copy files and directories from local static, to remote static,
-# preserving modification times on the files
-for f in localf:
-    print("This local file: " + f + " ", end=' ')
-    if not f in files:
-        sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
-        print("Uploaded.")
-    else:
-        print("Skipped.")
-"""
+#text =
-
-"""if len(files)==3 and 'users.csv' in files:
-    sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
-    sftp.get('users.csv','rosters/users-'+folder+'.csv')
-    sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
-    print folder + '\tSaved three data files in rosters folder.'
-
-    courses = open('rosters/courses-'+folder+'.csv','r')
-    courses.readline()
-    a = courses.readline()
-    print a
-    courses.close()
-    parts = a.split(',')
-    year = parts[1][0:4]
-    ss = parts[1][4:6]
-    #print parts[1]
-    sem = {'30':'spring', '50':'summer', '70':'fall' }
-    this_sem = sem[ss]
-    #print this_sem, "", year
-    print folder + '\tbuilding data file...'
-    convert_roster_files(this_sem,year,folder)
-    print folder + '\tmoving files...'
-    move_to_folder(this_sem,year,folder)
-else:
-    print folder + "\tDon't see all three files."""
-
-################
-################ GOOGLE DOCS
-################
-################
-################
-
-def sec(t): return "<h3>"+t+"</h3>\n"
-def para(t): return "<p>"+t+"</p>\n"
-def ul(t): return "<ul>"+t+"</ul>\n"
-def li(t): return "<li>"+t+"</li>\n"
-
-def question(t,bracket=1):
-    ret = ''
-    match = re.search( r'\[(.*)\]', t)
-    if match and bracket:
-        ret += "<a name='" + match.group(1) + "'></a>"
-        t = re.sub( r'\[.*\]','',t)
-    else:
-        parts = t.split(' ')
-        id = ''
-        for p in parts:
-            if re.search(r'[a-zA-Z]',p[0]): id += p[0]
-        ret += "<a name='%s'></a>" % id.lower()
-    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
-
-def answer(t):
-    return t + '</div></div>\n'
-
-def read_paragraph_element(element,type="NORMAL_TEXT"):
-    """Returns the text in the given ParagraphElement.
-
-    Args:
-        element: a ParagraphElement from a Google Doc.
-    """
-    text_run = element.get('textRun')
-    begin = ''
-    end = ''
-    if not text_run:
-        return ''
-    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
-        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
-        end = '</a>'
-    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
-        begin = '<strong>' + begin
-        end = end + '</strong>'
-
-    content = text_run.get('content')
-    content = re.sub(u'\u000b','<br />\n',content)
-
-    return begin + content + end
-
-def get_doc(docid, bracket=1, verbose=0):
-    import pickle
-    import os.path
-    from googleapiclient.discovery import build
-    from google_auth_oauthlib.flow import InstalledAppFlow
-    from google.auth.transport.requests import Request
-
-    #ooout = open(fileout,'w')
-
-    # If modifying these scopes, delete the file token.pickle.
-    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
-    creds = None
-    # The file token.pickle stores the user's access and refresh tokens, and is
-    # created automatically when the authorization flow completes for the first
-    # time.
-    if os.path.exists('token.pickle'):
-        with open('token.pickle', 'rb') as token:
-            creds = pickle.load(token)
-    # If there are no (valid) credentials available, let the user log in.
-    if not creds or not creds.valid:
-        if creds and creds.expired and creds.refresh_token:
-            creds.refresh(Request())
-        else:
-            flow = InstalledAppFlow.from_client_secrets_file(
-                'credentials.json', SCOPES)
-            creds = flow.run_local_server(port=0)
-        # Save the credentials for the next run
-        with open('token.pickle', 'wb') as token:
-            pickle.dump(creds, token)
-
-    service = build('docs', 'v1', credentials=creds)
-
-    # Retrieve the documents contents from the Docs service.
-    document = service.documents().get(documentId=docid).execute()
-    if verbose: print(document)
-
-    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
-    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
-    if verbose: print('The title of the document is: {}'.format(document.get('title')))
-    doc_content = document.get('body').get('content')
-    if verbose: print(doc_content)
-
-    doc_objects = document.get('inlineObjects')
-    if verbose: print(doc_objects)
-
-    doc_lists = document.get('lists')
-
-    text = '<div class="acrd_grp" data-accordion-group="">'
-    last_type = ''
-    answer_text = ''
-    in_a_list = ''
-
-    img_count = 1
-    img_lookup = {}
-    img_heights = {}
-    img_widths = {}
-
-    if doc_objects:
-        for k,value in doc_objects.items():
-            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
-            if 'inlineObjectProperties' in value:
-                if 'embeddedObject' in value['inlineObjectProperties']:
-                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
-                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
-                            print(k)
-                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
-                            response = requests.get(uu, stream=True)
-                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
-                            img_count += 1
-
-                            img_lookup[k] = name
-
-                            with open('cache/doc_images/'+name, 'wb') as out_file:
-                                shutil.copyfileobj(response.raw, out_file)
-                            print(uu)
-                            print(response.headers)
-                            print(name)
-                            #input('x?')
-                            del response
-                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
-                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
-                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
-
-    tempout.write('- - - - - - - -\n\n')
-    #for value in doc_lists:
-    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
-
-    tempout.write('- - - - - - - -\n\n')
-    list_stack = []
-    list_depth = 0
-    last_list_depth = 0
-    for value in doc_content:
-        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
-        if verbose: print(json.dumps(value, sort_keys=True, indent=4))
-
-        # todo: x link, x bold, list, image.
-        tag_fxn = para
-        if 'paragraph' in value:
-            this_text = ''
-
-            if 'bullet' in value['paragraph']:
-                # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
-                lid = value['paragraph']['bullet']['listId']
-
-                if not list_stack: # 1
-                    list_stack.append(lid)
-                else:
-                    if lid == list_stack[0]: # 2
-                        pass
-                    else:
-                        if not lid in list_stack: # 3
-                            list_stack.append(lid)
-                        else: # 4
-                            x = list_stack.pop()
-                            while x != lid: list_stack.pop()
-            elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
-                list_stack = []
-
-            list_depth = len(list_stack)
-
-            deeper = list_depth - last_list_depth
-
-            if deeper > 0:
-                answer_text += "<ul>" * deeper
-            elif deeper < 0:
-                deeper = -1 * deeper
-                answer_text += "</ul>" * deeper
-
-            if len(list_stack):
-                tag_fxn = li
-
-            elements = value.get('paragraph').get('elements')
-
-            # inlineObjectElement": {
-            #     "inlineObjectId": "kix.ssseeu8j9cfx",
-
-            if 'paragraphStyle' in value.get('paragraph'):
-                style = value.get('paragraph').get('paragraphStyle')
-                #text += json.dumps(style, sort_keys=True, indent=4)
-                if 'namedStyleType' in style:
-                    type = style['namedStyleType']
-
-            for elem in elements:
-
-                # text content
-                this_text += read_paragraph_element(elem,type)
-
-                # image content
-                if 'inlineObjectElement' in elem:
-                    vpi = elem['inlineObjectElement']
-                    if 'inlineObjectId' in vpi:
-                        ii = vpi['inlineObjectId']
-                        if ii in img_lookup:
-                            img = img_lookup[ii]
-                            h = img_heights[ii]
-                            w = img_widths[ii]
-                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
-
-            if last_type=='NORMAL_TEXT' and type!=last_type:
-                text += answer(answer_text)
-                answer_text = ''
-
-            if type=='HEADING_2':
-                text += sec(this_text)
-                this_text = ''
-            elif type=='HEADING_3':
-                text += question(this_text,bracket)
-                this_text = ''
-            else:
-                answer_text += tag_fxn(this_text)
-                this_text = ''
-            last_type = type
-            last_list_depth = list_depth
-
-        elif 'table' in value:
-            # The text in table cells are in nested Structural Elements and tables may be
-            # nested.
-            text += "\nTABLE\n"
-            #table = value.get('table')
-            #for row in table.get('tableRows'):
-            #    cells = row.get('tableCells')
-            #    for cell in cells:
-            #        text += read_strucutural_elements(cell.get('content'))
-        #elif 'tableOfContents' in value:
-        #    # The text in the TOC is also in a Structural Element.
-        #    toc = value.get('tableOfContents')
-        #    text += read_strucutural_elements(toc.get('content'))
-
-        #else:
-        #    print(json.dumps(value, sort_keys=True, indent=4))
-
-    text += answer(answer_text)
-    #text += '</div>'
-    #print(text)
-    return text
-
-######### TRY #2 ######
-
-def read_paragraph_element_2(element,type="NORMAL_TEXT"):
-    text_run = element.get('textRun')
-    begin = ''
-    end = ''
-    if not text_run: return ''
-    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
-        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
-        end = '</a>'
-    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
-        begin = '<strong>' + begin
-        end = end + '</strong>'
-    elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
-        begin = '<em>' + begin
-        end = end + '</em>'
-    content = text_run.get('content')
-    content = re.sub(u'\u000b','<br />\n',content)
-    return begin + content + end
-
-# t is a string that begins with "Icons: " ... and contains comma(space) separated list
-def handle_icons(t):
-    text = t[7:].strip()
-    parts = text.split(", ")
-    return ('icons',parts)
-
-# t is a string that begins with "Tags: " ... and contains comma(space) separated list
-def handle_tags(t):
-    text = t[6:].strip()
-    parts = text.split(", ")
-    return ('tags',parts)
-
-def handle_question(t,bracket=1):
-    anchor = ''
-    match = re.search( r'\[(.*)\]', t)
-    if match and bracket:
-        anchor = match.group(1).lower()
-        t = re.sub( r'\[.*\]','',t)
-    else:
-        parts = t.split(' ')
-        for p in parts:
-            if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
-    return ('question', t, anchor)
-
-def handle_answer(t):
-    return ('answer',t)
-
-def handle_sec(t): return ('section',t)
-def handle_para(t): return ('paragraph',t)
-def handle_ul(t): return ('unorderdedlist',t)
-def handle_li(t): return ('listitem',t)
-
-img_count = 1
-img_lookup = {}
-img_heights = {}
-img_widths = {}
-
-def fetch_doc_image(k,value):
-    global img_count, img_lookup, img_heights, img_widths
-    if 'inlineObjectProperties' in value:
-        if 'embeddedObject' in value['inlineObjectProperties']:
-            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
-                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
-                    print(k)
-                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
-                    response = requests.get(uu, stream=True)
-                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
-                    img_count += 1
-                    img_lookup[k] = name
-
-                    with open('cache/doc_images/'+name, 'wb') as out_file:
-                        shutil.copyfileobj(response.raw, out_file)
-                    print(uu)
-                    print(response.headers)
-                    print(name)
-                    del response
-            if 'size' in value['inlineObjectProperties']['embeddedObject']:
-                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
-                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
-
-def get_doc_generic(docid, bracket=1, verbose=0):
-    import pickle
-    import os.path
-    from googleapiclient.discovery import build
-    from google_auth_oauthlib.flow import InstalledAppFlow
-    from google.auth.transport.requests import Request
-    global img_count, img_lookup, img_heights, img_widths
-
-    # If modifying these scopes, delete the file token.pickle.
-    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
-    creds = None
-    # The file token.pickle stores the user's access and refresh tokens, and is
-    # created automatically when the authorization flow completes for the first
-    # time.
-    if os.path.exists('token.pickle'):
-        with open('token.pickle', 'rb') as token:
-            creds = pickle.load(token)
-    if not creds or not creds.valid:
-        if creds and creds.expired and creds.refresh_token:
-            creds.refresh(Request())
-        else:
-            flow = InstalledAppFlow.from_client_secrets_file(
-                'credentials.json', SCOPES)
-            creds = flow.run_local_server(port=0)
-        # Save the credentials for the next run
-        with open('token.pickle', 'wb') as token:
-            pickle.dump(creds, token)
-
-    service = build('docs', 'v1', credentials=creds)
-
-    # Retrieve the documents contents from the Docs service.
-    document = service.documents().get(documentId=docid).execute()
-
-    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
-    tempout.write( json.dumps(document,indent=2) \
-        + "\n\n\n------------------------------------\n\n")
-    if verbose: print('The title of the document is: {}'.format(document.get('title')))
-
-    doc_content = document.get('body').get('content')
-    doc_objects = document.get('inlineObjects')
-    doc_lists = document.get('lists')
-
-    #text = ''
     result = []
     last_type = ''
     #answer_text = ''

23 search.py

@@ -554,3 +554,26 @@ if __name__ == "__main__":
     # Call the function in the options dict
     options[ int(resp)][1]()
+
+def try_clustering(df):
+    from sklearn.cluster import KMeans
+    df = df.drop(['code'], axis=1)
+    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
+    return kmeans.labels_
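try_clustering drops the non-numeric 'code' column and runs k-means with four clusters on whatever numeric columns remain, returning one label per row. Hedged call sketch; the DataFrame columns are invented:

    import pandas as pd
    df = pd.DataFrame({'code':  ['A', 'B', 'C', 'D', 'E', 'F'],
                       'fill':  [0.9, 0.2, 0.8, 0.1, 0.5, 0.95],
                       'seats': [30, 45, 30, 60, 45, 30]})
    labels = try_clustering(df)  # e.g. array([0, 2, 0, 3, 1, 0])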
+
+def nlp_sample():
+    from gensim import utils, corpora
+    from nltk import stem
+    stemmer = stem.porter.PorterStemmer()
+    strings = [
+        "Human machine interface for lab abc computer applications",
+        "A survey of user opinion of computer system response time",
+        "The EPS user interface management system",
+        "System and human system engineering testing of EPS",
+        "Relation of user perceived response time to error measurement",
+        "The generation of random binary unordered trees",
+        "The intersection graph of paths in trees",
+        "Graph minors IV Widths of trees and well quasi ordering",
+        "Graph minors A survey",
+    ]
+    processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
+    dictionary = corpora.Dictionary(processed)
+    return dictionary
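nlp_sample lowercases and tokenizes each string, keeps tokens of four or more characters, Porter-stems them, and builds a gensim Dictionary mapping each surviving stem to an integer id. Usage sketch (ids depend on corpus order):

    dct = nlp_sample()
    print(dct.token2id)                               # e.g. {'human': 0, 'machin': 1, ...}
    print(dct.doc2bow(['human', 'human', 'machin']))  # e.g. [(0, 2), (1, 1)]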

8 users.py

@@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
     print(json.dumps(g2, indent=2))


-def nlp_sample():
+## moved: nlp_sample now in search.py
+# def nlp_sample():
 # Stream a training corpus directly from S3.
 #corpus = corpora.MmCorpus("s3://path/to/corpus")

@@ -1955,9 +1956,7 @@ def nlp_sample():
     "Graph minors IV Widths of trees and well quasi ordering",
     "Graph minors A survey",
     ]
-    processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
-    print(processed)
-    dictionary = corpora.Dictionary( processed )
+    # moved
     dct = dictionary
     print(dictionary)

@@ -2980,4 +2979,3 @@ if __name__ == "__main__":

     # Call the function in the options dict
     options[ int(resp)][1]()