This commit is contained in:
Peter Howell 2025-09-05 23:06:34 +00:00
parent ff5ed654eb
commit 9584f45f30
7 changed files with 1472 additions and 1556 deletions

View File

@ -1528,6 +1528,334 @@ LANE: HyFlex
################
################ GOOGLE DOCS HELPERS (moved from pipelines)
################
def sec(t):
    """Wrap *t* as an <h3> section heading, newline-terminated."""
    return f"<h3>{t}</h3>\n"
def para(t):
    """Wrap *t* as a <p> paragraph, newline-terminated."""
    return f"<p>{t}</p>\n"
def ul(t):
    """Wrap *t* as a <ul> list, newline-terminated."""
    return f"<ul>{t}</ul>\n"
def li(t):
    """Wrap *t* as a <li> list item, newline-terminated."""
    return f"<li>{t}</li>\n"
def question(t, bracket=1):
    """Render *t* as an accordion question header (HTML string).

    When *bracket* is truthy and *t* contains an explicit "[anchor]" tag,
    that tag becomes the <a name=...> anchor and is stripped from the
    display text; otherwise the anchor is built from the lower-cased
    initials of the words of *t*.

    Fix: initials are taken only from non-empty word fragments, so
    consecutive spaces in *t* no longer raise IndexError on p[0].
    """
    ret = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        anchor_id = ''  # renamed from `id` (shadowed the builtin)
        for p in parts:
            # Skip empty fragments produced by consecutive spaces.
            if p and re.search(r'[a-zA-Z]', p[0]):
                anchor_id += p[0]
        ret += "<a name='%s'></a>" % anchor_id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
def answer(t):
    """Close the accordion content and container opened by question()."""
    return '%s</div></div>\n' % t
def read_paragraph_element(element, type="NORMAL_TEXT"):
    """Render one Docs paragraph element's textRun as an HTML fragment.

    Linked runs are wrapped in <a href=...>; bold runs are wrapped in
    <strong>, but only for NORMAL_TEXT paragraphs. Vertical-tab characters
    (Docs soft line breaks) become "<br />\\n". Elements without a textRun
    render as the empty string.
    """
    text_run = element.get('textRun')
    if not text_run:
        return ''
    prefix, suffix = '', ''
    style = text_run.get('textStyle', {})
    if 'link' in style:
        prefix = '<a href="' + style['link']['url'] + '">'
        suffix = '</a>'
    if style.get('bold') == True and type == "NORMAL_TEXT":
        prefix = '<strong>' + prefix
        suffix = suffix + '</strong>'
    body = re.sub(u'\u000b', '<br />\n', text_run.get('content'))
    return prefix + body + suffix
def read_paragraph_element_2(element, type="NORMAL_TEXT"):
    """Alias for read_paragraph_element(); kept for older call sites."""
    return read_paragraph_element(element, type)
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
    """Parse a line of the form "Icons: a, b, c" into ('icons', ['a','b','c'])."""
    items = t[len("Icons: "):].strip().split(", ")
    return ('icons', items)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
    """Parse a line of the form "Tags: a, b, c" into ('tags', ['a','b','c'])."""
    items = t[len("Tags: "):].strip().split(", ")
    return ('tags', items)
def handle_question(t, bracket=1):
    """Parse a question line into the tuple ('question', text, anchor).

    When *bracket* is truthy and *t* contains an explicit "[anchor]" tag,
    the anchor is that tag lower-cased and the tag is stripped from the
    text; otherwise the anchor is the lower-cased initials of the words
    of *t*.

    Fix: initials are taken only from non-empty word fragments, so
    consecutive spaces in *t* no longer raise IndexError on p[0].
    """
    anchor = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        for p in parts:
            # Skip empty fragments produced by consecutive spaces.
            if p and re.search(r'[a-zA-Z]', p[0]):
                anchor += p[0].lower()
    return ('question', t, anchor)
def handle_answer(t):
    """Tag raw answer text for the document assembler."""
    return ('answer', t)
def handle_sec(t):
    """Tag *t* as a section heading."""
    return ('section', t)
def handle_para(t):
    """Tag *t* as a paragraph."""
    return ('paragraph', t)
def handle_ul(t):
    """Tag *t* as an unordered list.

    The 'unorderdedlist' key [sic] is kept byte-for-byte: downstream
    consumers match on this exact (misspelled) string.
    """
    return ('unorderdedlist', t)
def handle_li(t):
    """Tag *t* as a list item."""
    return ('listitem', t)
# Module-level image bookkeeping, apparently retained for the commented-out
# global-state fetch_doc_image below; get_doc() keeps its own local copies
# of these names — TODO confirm no other module still reads these globals.
img_count = 1    # next sequential image file number
img_lookup = {}  # inlineObjectId -> cached image filename
img_heights = {} # inlineObjectId -> pixel height
img_widths = {}  # inlineObjectId -> pixel width
# Dead code: an earlier global-state fetch_doc_image, kept only as an
# unassigned string literal (it never executes). Superseded by the
# fetch_doc_image def that follows — candidate for deletion.
'''def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
'''
def fetch_doc_image(k, value):
    """Download the inline image referenced by a Docs inlineObject entry.

    k:     the inlineObject id, used to build the cached file name.
    value: one entry of document['inlineObjects'].

    Saves the image to cache/doc_images/image_<k>.<ext> (extension taken
    from the response content-type) and returns True. Returns False when
    *value* carries no downloadable image — the original fell through and
    returned None here, so falsy-ness is preserved for any caller that
    truth-tests the result. The HTTP response is now closed explicitly.
    """
    import shutil
    embedded = value.get('inlineObjectProperties', {}).get('embeddedObject', {})
    uu = embedded.get('imageProperties', {}).get('contentUri')
    if not uu:
        return False
    response = requests.get(uu, stream=True)
    try:
        # e.g. content-type "image/png" -> extension "png"
        name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
        with open('cache/doc_images/' + name, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    finally:
        response.close()
    return True
def get_doc(docid, bracket=1, verbose=0):
    """Fetch a Google Doc and render it as accordion-style HTML.

    docid:   Google Docs document id.
    bracket: forwarded to question(); when truthy, a "[tag]" inside a
             HEADING_3 paragraph supplies the anchor name.
    verbose: when truthy, print intermediate API payloads.

    Returns the assembled HTML string. Side effects: caches OAuth creds in
    token.pickle, dumps raw API payloads to cache/trash/gdoctemp.txt, and
    downloads inline images to cache/doc_images/. Relies on module-level
    codecs/json/requests and the helpers sec/question/answer/para/li/
    read_paragraph_element.
    """
    import pickle, shutil
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    #ooout = open(fileout,'w')
    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    service = build('docs', 'v1', credentials=creds)
    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)
    # Raw dump of everything received, for offline debugging.
    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)
    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)
    doc_lists = document.get('lists')  # NOTE(review): unused except the commented loop below
    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    in_a_list = ''  # NOTE(review): never read; list state is tracked via list_stack
    # Local image bookkeeping (shadows the module-level names of the same
    # purpose): inlineObjectId -> cached filename / pixel dimensions.
    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}
    if doc_objects:
        # First pass: download every inline image so the paragraph pass can
        # reference it by inlineObjectId.
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            # Extension taken from the content-type (image/png -> png).
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1
                            img_lookup[k] = name
                            with open('cache/doc_images/'+name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    # NOTE(review): size is recorded even when no contentUri was
                    # downloaded for this object — confirm that is intentional.
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
    tempout.write('- - - - - - - -\n\n')
    # Nested-list state: a stack of Docs listIds, one entry per open <ul>.
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    # Second pass: walk the structural elements and emit HTML.
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))
        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''
            if 'bullet' in value['paragraph']:
                # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
                lid = value['paragraph']['bullet']['listId']
                if not list_stack: # 1
                    list_stack.append(lid)
                else:
                    if lid == list_stack[0]: # 2
                        pass
                    else:
                        if not lid in list_stack: # 3
                            list_stack.append(lid)
                        else: # 4
                            x = list_stack.pop()
                            while x != lid: list_stack.pop()
            elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
                list_stack = []
            # Open/close <ul> tags to match the change in nesting depth.
            list_depth = len(list_stack)
            deeper = list_depth - last_list_depth
            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                deeper = -1 * deeper
                answer_text += "</ul>" * deeper
            if len(list_stack):
                tag_fxn = li
            elements = value.get('paragraph').get('elements')
            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",
            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']
            # NOTE(review): if the very first paragraph carries no
            # namedStyleType, `type` is referenced below before assignment.
            for elem in elements:
                # text content
                this_text += read_paragraph_element(elem,type)
                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
            # Leaving a run of normal text: close the open accordion answer.
            if last_type=='NORMAL_TEXT' and type!=last_type:
                text += answer(answer_text)
                answer_text = ''
            # HEADING_2 -> section title, HEADING_3 -> accordion question,
            # everything else accumulates into the current answer body.
            if type=='HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type=='HEADING_3':
                text += question(this_text,bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth
        elif 'table' in value:
            # The text in table cells are in nested Structural Elements and tables may be
            # nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_strucutural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_strucutural_elements(toc.get('content'))
        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))
    # Flush whatever answer text is still open, then return.
    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text
def get_doc_generic(docid, bracket=1, verbose=0):
    """Alias for get_doc(); kept for callers using the generic name."""
    return get_doc(docid, bracket=bracket, verbose=verbose)
@ -1567,4 +1895,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()

View File

@ -2690,24 +2690,7 @@ def enrollment_helper():
# fill percentage for each section, then by mode, tod, campus # fill percentage for each section, then by mode, tod, campus
def try_clustering(df): ## moved: try_clustering now in search.py
# Import required libraries
from sklearn.cluster import KMeans
# Preprocessing
# Assuming df is your DataFrame and "modes" is your categorical column
#df['code'] = df['code'].astype('category').cat.codes
# Removing any other unnecessary columns
df = df.drop(['code'], axis=1)
# Perform KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
# Get the cluster labels
labels = kmeans.labels_
# Add labels to the DataFrame # Add labels to the DataFrame
#df['clusters'] = labels #df['clusters'] = labels
#print(df) #print(df)

View File

@ -4,6 +4,98 @@
# from pipelines - canvas data # from pipelines - canvas data
# Canvas data, download all new files
def sync_non_interactive():
    """Download every Canvas Data file not already present locally.

    Non-interactive variant: the confirmation prompt is commented out, so
    all missing files are fetched immediately from their signed URLs into
    local_data_folder. Prints a per-file Yes/No inventory and a final
    success/failure tally. Relies on module globals: do_request, mylog,
    local_data_folder, os, json, requests.
    """
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    # Compare the remote file list against what is already on disk.
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)
        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
        i += 1
    print("I will attempt to download %i files." % len(wanted))
    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return
    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if(response.status_code != 200):
            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
                (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1
        else:
            #Use the downloaded data
            # Stream to disk in chunks rather than buffering the whole file.
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
## OLD STYLE CANVAS DATA
# Get something from Canvas Data
def do_request(path):
    """Issue an HMAC-signed GET against the legacy Canvas Data API.

    path: request path beginning with '/', e.g. '/api/account/self/file/sync'.

    Returns the parsed JSON response body, or None when the response is not
    200 (a diagnostic is printed). Relies on module globals apiKey and
    apiSecret for signing.

    Fix: the failure branch previously printed a tuple (a leftover
    Python-2-style print); it now prints a formatted message, and the None
    return on failure is explicit instead of an implicit fall-through.
    """
    #Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'
    # The API's HMAC scheme signs exactly this newline-joined field sequence.
    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)
    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')
    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType
    #Submit the request/get a response
    uri = "https://"+host+path
    print (uri)
    print (headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)
    #Check to make sure the request was ok
    if(response.status_code != 200):
        print('Request response went bad. Got back a %s code, meaning the request was %s' %
              (response.status_code, response.reason))
        return None
    #Use the downloaded data
    jsonData = response.json()
    #print(json.dumps(jsonData, indent=4))
    return jsonData
def file_doesnt_exist(name): def file_doesnt_exist(name):
# Get list of files in current directory # Get list of files in current directory
files = os.listdir() files = os.listdir()

View File

@ -8,7 +8,7 @@ from datetime import datetime as dt
from datetime import timedelta from datetime import timedelta
from dateutil.parser import parse from dateutil.parser import parse
from os.path import exists, getmtime from os.path import exists, getmtime
from pipelines import sync_non_interactive, url, header from pipelines import url, header
import util import util
from semesters import short_to_sis from semesters import short_to_sis
@ -1121,7 +1121,7 @@ def full_reload():
except Exception as e: except Exception as e:
print("Couldn't rename file:", str(e)) print("Couldn't rename file:", str(e))
sync_non_interactive() #sync_non_interactive()
setup_table('requests_sum1') setup_table('requests_sum1')
setup_table('courses') setup_table('courses')

File diff suppressed because it is too large Load Diff

View File

@ -554,3 +554,26 @@ if __name__ == "__main__":
# Call the function in the options dict # Call the function in the options dict
options[ int(resp)][1]() options[ int(resp)][1]()
def try_clustering(df):
    """Cluster the rows of *df* (excluding its 'code' column) into 4 groups.

    Returns the fitted KMeans label array, one integer label per row.
    The 'code' column is dropped before fitting; random_state is pinned
    for reproducibility.
    """
    from sklearn.cluster import KMeans
    features = df.drop(['code'], axis=1)
    model = KMeans(n_clusters=4, random_state=0)
    model.fit(features)
    return model.labels_
def nlp_sample():
    """Build a gensim Dictionary from a small stemmed demo corpus.

    Tokenizes nine sample sentences (minimum token length 4), applies
    Porter stemming, and returns the resulting corpora.Dictionary.
    """
    from gensim import utils, corpora
    from nltk import stem
    stemmer = stem.porter.PorterStemmer()
    strings = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = []
    for sentence in strings:
        tokens = utils.simple_preprocess(sentence, min_len=4)
        processed.append([stemmer.stem(tok) for tok in tokens])
    return corpora.Dictionary(processed)

View File

@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
print(json.dumps(g2, indent=2)) print(json.dumps(g2, indent=2))
def nlp_sample(): ## moved: nlp_sample now in search.py
# def nlp_sample():
# Stream a training corpus directly from S3. # Stream a training corpus directly from S3.
#corpus = corpora.MmCorpus("s3://path/to/corpus") #corpus = corpora.MmCorpus("s3://path/to/corpus")
@ -1955,9 +1956,7 @@ def nlp_sample():
"Graph minors IV Widths of trees and well quasi ordering", "Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey", "Graph minors A survey",
] ]
processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings] # moved
print(processed)
dictionary = corpora.Dictionary( processed )
dct = dictionary dct = dictionary
print(dictionary) print(dictionary)
@ -2980,4 +2979,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()