cleanup

This commit is contained in:
parent ff5ed654eb
commit 9584f45f30

content.py (329 lines changed)
@@ -1528,6 +1528,334 @@ LANE: HyFlex

################
################ GOOGLE DOCS HELPERS (moved from pipelines)
################

def sec(t):  return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t):   return "<ul>"+t+"</ul>\n"
def li(t):   return "<li>"+t+"</li>\n"
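
Taken together these helpers compose nested HTML strings; a minimal usage sketch (sample text is illustrative only):

    html = sec("Lab Hours") + para("Open weekdays.") + ul(li("Mon 8-5") + li("Fri 8-3"))
    # '<h3>Lab Hours</h3>\n<p>Open weekdays.</p>\n<ul><li>Mon 8-5</li>\n<li>Fri 8-3</li>\n</ul>\n'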
def question(t, bracket=1):
    ret = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        # An explicit anchor was supplied in brackets at the end of the question.
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub(r'\[.*\]', '', t)
    else:
        # No bracketed anchor: build one from the first letter of each word.
        parts = t.split(' ')
        id = ''
        for p in parts:
            if p and re.search(r'[a-zA-Z]', p[0]): id += p[0]
        ret += "<a name='%s'></a>" % id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'

def answer(t):
    return t + '</div></div>\n'
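
question() opens an accordion item behind a named anchor and answer() closes it; a sketch of the intended pairing (sample strings hypothetical):

    q = question("How do I enroll? [enroll]")
    # "<a name='enroll'></a><div class=\"accordion\" data-accordion=\"\"><h4 class=\"acrd_cntl\">How do I enroll? </h4>\n<div class=\"acrd_cntnt\">"
    html = q + answer(para("Use the registration portal."))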
def read_paragraph_element(element, type="NORMAL_TEXT"):
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run:
        return ''
    # Linked runs become <a> wrappers.
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    # Bold runs in body text become <strong>; headings keep their own styling.
    if 'textStyle' in text_run and text_run['textStyle'].get('bold') == True and type == "NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'
    content = text_run.get('content')
    # Docs encodes soft line breaks as vertical tabs (U+000B).
    content = re.sub(u'\u000b', '<br />\n', content)
    return begin + content + end
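
A sketch of the element shape this expects, per the Docs API v1 body content (values hypothetical):

    elem = {'textRun': {'content': 'See the catalog.\u000b',
                        'textStyle': {'bold': True, 'link': {'url': 'https://example.edu'}}}}
    read_paragraph_element(elem)
    # '<strong><a href="https://example.edu">See the catalog.<br />\n</a></strong>'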
def read_paragraph_element_2(element, type="NORMAL_TEXT"):
    return read_paragraph_element(element, type)
# t is a string that begins with "Icons: " and contains a comma(space) separated list.
def handle_icons(t):
    text = t[7:].strip()   # len("Icons: ") == 7
    parts = text.split(", ")
    return ('icons', parts)

# t is a string that begins with "Tags: " and contains a comma(space) separated list.
def handle_tags(t):
    text = t[6:].strip()   # len("Tags: ") == 6
    parts = text.split(", ")
    return ('tags', parts)
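
Both parsers keep the items verbatim after the label; a quick sketch (sample labels hypothetical):

    handle_icons("Icons: wifi, printer, scanner")   # ('icons', ['wifi', 'printer', 'scanner'])
    handle_tags("Tags: hours, parking")             # ('tags', ['hours', 'parking'])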
def handle_question(t, bracket=1):
    anchor = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        for p in parts:
            if p and re.search(r'[a-zA-Z]', p[0]): anchor += p[0].lower()
    return ('question', t, anchor)
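
handle_question mirrors question() above but returns the pieces instead of HTML; sample inputs hypothetical:

    handle_question("What is HyFlex? [hyflex]")     # ('question', 'What is HyFlex? ', 'hyflex')
    handle_question("Where do I park?", bracket=0)  # ('question', 'Where do I park?', 'wdip')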
def handle_answer(t):
    return ('answer', t)

def handle_sec(t):  return ('section', t)
def handle_para(t): return ('paragraph', t)
def handle_ul(t):   return ('unorderdedlist', t)
def handle_li(t):   return ('listitem', t)
# Module-level caches for images pulled out of a doc's inlineObjects.
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}
'''def fetch_doc_image(k,value):
    global img_count, img_lookup, img_heights, img_widths
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    print(k)
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                    img_count += 1
                    img_lookup[k] = name

                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    print(uu)
                    print(response.headers)
                    print(name)
                    del response
            if 'size' in value['inlineObjectProperties']['embeddedObject']:
                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
'''
def fetch_doc_image(k, value):
    import shutil
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    # contentUri is a short-lived URL for the embedded image.
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
    return True
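
A sketch of driving fetch_doc_image from a fetched document (assumes cache/doc_images/ exists and document came from service.documents().get(), as in get_doc below):

    for k, value in (document.get('inlineObjects') or {}).items():
        fetch_doc_image(k, value)   # writes cache/doc_images/image_<k>.<ext>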

def get_doc(docid, bracket=1, verbose=0):
    import pickle, shutil
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request

    #ooout = open(fileout,'w')

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)

    tempout = codecs.open('cache/trash/gdoctemp.txt', 'w', 'utf-8')
    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)

    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)

    doc_lists = document.get('lists')

    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    type = 'NORMAL_TEXT'   # default paragraph style until the doc supplies one
    answer_text = ''
    in_a_list = ''

    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}

    # Download each inline image once, keyed by its inlineObject id.
    if doc_objects:
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1

                            img_lookup[k] = name

                            with open('cache/doc_images/'+name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])

    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")

    tempout.write('- - - - - - - -\n\n')
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''

            if 'bullet' in value['paragraph']:
                # Either we're (1) starting a new list, (2) in one, (3) starting a nested one, or (4) finished a nested one.

                lid = value['paragraph']['bullet']['listId']

                if not list_stack: # 1
                    list_stack.append(lid)
                else:
                    if lid == list_stack[0]: # 2
                        pass
                    else:
                        if not lid in list_stack: # 3
                            list_stack.append(lid)
                        else: # 4
                            # Pop nested list ids until this list is back on top of the stack.
                            x = list_stack.pop()
                            while x != lid: x = list_stack.pop()
                            list_stack.append(lid)
            elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
                list_stack = []

            list_depth = len(list_stack)

            deeper = list_depth - last_list_depth

            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                deeper = -1 * deeper
                answer_text += "</ul>" * deeper

            if len(list_stack):
                tag_fxn = li

            elements = value.get('paragraph').get('elements')

            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",

            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            for elem in elements:

                # text content
                this_text += read_paragraph_element(elem,type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)

            # A run of body text ends when the style changes: close the open answer.
            if last_type=='NORMAL_TEXT' and type!=last_type:
                text += answer(answer_text)
                answer_text = ''

            if type=='HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type=='HEADING_3':
                text += question(this_text,bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            # The text in table cells is in nested Structural Elements, and tables may be nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_strucutural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_strucutural_elements(toc.get('content'))

        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))

    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text

def get_doc_generic(docid, bracket=1, verbose=0):
    return get_doc(docid, bracket, verbose)
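
A usage sketch; the document id and output path are hypothetical, and the first run opens a browser for OAuth consent (credentials.json must be present):

    html = get_doc('1aBcD_hypothetical_doc_id', bracket=1)
    with open('cache/trash/faq.html', 'w') as f:   # hypothetical output path
        f.write(html)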

@@ -1567,4 +1895,3 @@ if __name__ == "__main__":
    # Call the function in the options dict
    options[ int(resp)][1]()

courses.py (19 lines changed)

@@ -2690,24 +2690,7 @@ def enrollment_helper():
# fill percentage for each section, then by mode, tod, campus

-def try_clustering(df):
-    # Import required libraries
-    from sklearn.cluster import KMeans
-
-    # Preprocessing
-    # Assuming df is your DataFrame and "modes" is your categorical column
-    #df['code'] = df['code'].astype('category').cat.codes
-
-    # Removing any other unnecessary columns
-    df = df.drop(['code'], axis=1)
-
-    # Perform KMeans clustering
-    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
-
-    # Get the cluster labels
-    labels = kmeans.labels_
+## moved: try_clustering now in search.py

# Add labels to the DataFrame
#df['clusters'] = labels
#print(df)

@@ -4,6 +4,98 @@
# from pipelines - canvas data

# Canvas data, download all new files
def sync_non_interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)

        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
        i += 1
    print("I will attempt to download %i files." % len(wanted))

    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return

    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if(response.status_code != 200):
            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
                (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1

        else:
            #Use the downloaded data
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))

## OLD STYLE CANVAS DATA

# Get something from Canvas Data
def do_request(path):
    #Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'

    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)

    # The signature is an HMAC-SHA256 over the newline-joined request parts, keyed by apiSecret.
    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')

    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')

    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType

    #Submit the request/get a response
    uri = "https://"+host+path
    print (uri)
    print (headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)

    #Check to make sure the request was ok
    if(response.status_code != 200):
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
    else:
        #Use the downloaded data
        jsonData = response.json()
        #print(json.dumps(jsonData, indent=4))
        return jsonData
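
The same listing call sync_non_interactive makes, as a standalone sketch (note that do_request returns None on a non-200 response):

    listing = do_request('/api/account/self/file/sync')
    if listing:
        print('%i files available' % len(listing.get('files', [])))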

def file_doesnt_exist(name):
    # Get list of files in current directory
    files = os.listdir()

@@ -8,7 +8,7 @@ from datetime import datetime as dt
from datetime import timedelta
from dateutil.parser import parse
from os.path import exists, getmtime
-from pipelines import sync_non_interactive, url, header
+from pipelines import url, header
import util
from semesters import short_to_sis

@@ -1121,7 +1121,7 @@ def full_reload():
    except Exception as e:
        print("Couldn't rename file:", str(e))

-    sync_non_interactive()
+    #sync_non_interactive()

    setup_table('requests_sum1')
    setup_table('courses')

pipelines.py (2553 lines changed; file diff suppressed because it is too large)

search.py (23 lines changed)

@@ -554,3 +554,26 @@ if __name__ == "__main__":
    # Call the function in the options dict
    options[ int(resp)][1]()

def try_clustering(df):
    from sklearn.cluster import KMeans
    df = df.drop(['code'], axis=1)
    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
    return kmeans.labels_

def nlp_sample():
    from gensim import utils, corpora
    from nltk import stem
    stemmer = stem.porter.PorterStemmer()
    strings = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
    dictionary = corpora.Dictionary(processed)
    return dictionary
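
The returned Dictionary maps stemmed tokens to integer ids; a follow-on sketch of typical use (query string hypothetical):

    from nltk import stem
    stemmer = stem.porter.PorterStemmer()
    dct = nlp_sample()
    bow = dct.doc2bow([stemmer.stem(w) for w in "human computer interaction".split()])
    # list of (token_id, count) pairs for tokens the dictionary knows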

users.py (8 lines changed)

@@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
    print(json.dumps(g2, indent=2))

-def nlp_sample():
+## moved: nlp_sample now in search.py
+# def nlp_sample():
    # Stream a training corpus directly from S3.
    #corpus = corpora.MmCorpus("s3://path/to/corpus")

@@ -1955,9 +1956,7 @@ def nlp_sample():
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
-    processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
-    print(processed)
-    dictionary = corpora.Dictionary( processed )
+    # moved
    dct = dictionary
    print(dictionary)

@@ -2980,4 +2979,3 @@ if __name__ == "__main__":
    # Call the function in the options dict
    options[ int(resp)][1]()