cleanup
commit 9584f45f30 (parent ff5ed654eb)

content.py (329 lines changed)

@@ -1528,6 +1528,334 @@ LANE: HyFlex

################
################ GOOGLE DOCS HELPERS (moved from pipelines)
################
def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"

def question(t, bracket=1):
    ret = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        # An explicit [anchor] tag names the anchor and is stripped from the title.
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub(r'\[.*\]', '', t)
    else:
        # No bracketed tag: build the anchor from the first letter of each word.
        parts = t.split(' ')
        id = ''
        for p in parts:
            if p and re.search(r'[a-zA-Z]', p[0]): id += p[0]
        ret += "<a name='%s'></a>" % id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'

def answer(t):
    return t + '</div></div>\n'
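A quick sketch of how these helpers compose; the heading strings here are made-up examples, not content from the repo:

# "[hyflex]" supplies an explicit anchor; otherwise word initials are used.
html = question("What is HyFlex? [hyflex]") + answer(para("A mixed-mode course."))
# question("Getting Started Guide") would emit <a name='gsg'></a> instead.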
def read_paragraph_element(element, type="NORMAL_TEXT"):
    # Render one Docs API ParagraphElement as HTML (links and bold only).
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run:
        return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold'] == True and type == "NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'
    content = text_run.get('content')
    content = re.sub(u'\u000b', '<br />\n', content)  # vertical tab -> explicit line break
    return begin + content + end


def read_paragraph_element_2(element, type="NORMAL_TEXT"):
    return read_paragraph_element(element, type)
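For reference, a minimal sketch of the textRun structure this consumes; the field names follow the Docs API, but the values are invented:

elem = {'textRun': {'content': 'course syllabus',
                    'textStyle': {'bold': True}}}
read_paragraph_element(elem)   # -> '<strong>course syllabus</strong>'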
# t is a string that begins with "Icons: " and contains a comma(space) separated list
def handle_icons(t):
    text = t[7:].strip()   # strip the "Icons: " prefix
    parts = text.split(", ")
    return ('icons', parts)

# t is a string that begins with "Tags: " and contains a comma(space) separated list
def handle_tags(t):
    text = t[6:].strip()   # strip the "Tags: " prefix
    parts = text.split(", ")
    return ('tags', parts)

def handle_question(t, bracket=1):
    anchor = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        for p in parts:
            if p and re.search(r'[a-zA-Z]', p[0]): anchor += p[0].lower()
    return ('question', t, anchor)

def handle_answer(t):
    return ('answer', t)

def handle_sec(t): return ('section', t)
def handle_para(t): return ('paragraph', t)
def handle_ul(t): return ('unorderdedlist', t)
def handle_li(t): return ('listitem', t)
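These handlers all return tagged tuples, so presumably a caller routes raw lines by prefix; a hypothetical dispatcher, not part of the commit:

def dispatch(line):
    # Route a line to the first handler whose prefix matches.
    for prefix, fxn in (('Icons: ', handle_icons), ('Tags: ', handle_tags)):
        if line.startswith(prefix):
            return fxn(line)
    return handle_para(line)

dispatch('Tags: online, hybrid')   # -> ('tags', ['online', 'hybrid'])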
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}


'''def fetch_doc_image(k,value):
    global img_count, img_lookup, img_heights, img_widths
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    print(k)
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                    img_count += 1
                    img_lookup[k] = name

                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    print(uu)
                    print(response.headers)
                    print(name)
                    del response
            if 'size' in value['inlineObjectProperties']['embeddedObject']:
                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
'''
def fetch_doc_image(k, value):
    # Download one inline image from a Doc's inlineObjects entry into the local cache.
    import shutil
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
                    with open('cache/doc_images/' + name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
    return True
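Presumably this is driven by walking a fetched document's inlineObjects map; a sketch, assuming `document` came back from the Docs API as in get_doc below:

for k, value in (document.get('inlineObjects') or {}).items():
    fetch_doc_image(k, value)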
def get_doc(docid, bracket=1, verbose=0):
    import pickle, shutil
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request

    #ooout = open(fileout,'w')

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    service = build('docs', 'v1', credentials=creds)

    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)

    tempout = codecs.open('cache/trash/gdoctemp.txt', 'w', 'utf-8')
    tempout.write(json.dumps(document, indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)

    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)

    doc_lists = document.get('lists')

    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    in_a_list = ''

    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}

    # Download every inline image up front so the content pass can reference it.
    if doc_objects:
        for k, value in doc_objects.items():
            tempout.write("->" + k + "=" + json.dumps(value, indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1

                            img_lookup[k] = name

                            with open('cache/doc_images/' + name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])

    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")

    tempout.write('- - - - - - - -\n\n')
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    for value in doc_content:
        tempout.write(json.dumps(value, indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''

            if 'bullet' in value['paragraph']:
                # Either we're (1) starting a new list, (2) in one, (3) starting
                # a nested one, or (4) finished a nested one.
                lid = value['paragraph']['bullet']['listId']

                if not list_stack:                   # 1
                    list_stack.append(lid)
                elif lid == list_stack[-1]:          # 2
                    pass
                elif not lid in list_stack:          # 3
                    list_stack.append(lid)
                else:                                # 4: pop until lid is back on top
                    while list_stack and list_stack[-1] != lid:
                        list_stack.pop()
            elif len(list_stack) > 0:  # current para isn't a bullet but we still have a list open.
                list_stack = []

            list_depth = len(list_stack)

            deeper = list_depth - last_list_depth

            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                answer_text += "</ul>" * -deeper

            if len(list_stack):
                tag_fxn = li

            elements = value.get('paragraph').get('elements')

            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",

            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            for elem in elements:

                # text content
                this_text += read_paragraph_element(elem, type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img, w, h)

            # A heading after normal text closes the open answer block.
            if last_type == 'NORMAL_TEXT' and type != last_type:
                text += answer(answer_text)
                answer_text = ''

            if type == 'HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type == 'HEADING_3':
                text += question(this_text, bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            # The text in table cells is in nested Structural Elements, and tables may be nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_structural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_structural_elements(toc.get('content'))

        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))

    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text
def get_doc_generic(docid, bracket=1, verbose=0):
    return get_doc(docid, bracket, verbose)
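A usage sketch; the document id and output path are placeholders, and credentials.json / token.pickle must exist as described above:

html = get_doc('1aBcD-placeholder-doc-id', bracket=1)   # accordion-group HTML as one string
with open('cache/faq.html', 'w') as f:
    f.write(html)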
@@ -1567,4 +1895,3 @@ if __name__ == "__main__":

    # Call the function in the options dict
    options[ int(resp)][1]()
courses.py (19 lines changed)

@@ -2690,24 +2690,7 @@ def enrollment_helper():

# fill percentage for each section, then by mode, tod, campus

def try_clustering(df):
    # Import required libraries
    from sklearn.cluster import KMeans

    # Preprocessing

    # Assuming df is your DataFrame and "modes" is your categorical column
    #df['code'] = df['code'].astype('category').cat.codes

    # Removing any other unnecessary columns
    df = df.drop(['code'], axis=1)

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)

    # Get the cluster labels
    labels = kmeans.labels_

## moved: try_clustering now in search.py
    # Add labels to the DataFrame
    #df['clusters'] = labels
    #print(df)
@@ -4,6 +4,98 @@

# from pipelines - canvas data
# Canvas data, download all new files
def sync_non_interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)

        print(str(i) + '.\tLocal: %s\tRemote: %s' % (exi, filename))
        i += 1
    print("I will attempt to download %i files." % len(wanted))

    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return

    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if response.status_code != 200:
            print('Request response went bad. Got back a %s code, meaning the request was %s' %
                  (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1
        else:
            # Use the downloaded data
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted), good_count, bad_count))
## OLD STYLE CANVAS DATA

# Get something from Canvas Data
def do_request(path):
    # Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'

    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)

    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(), 'utf-8')

    sig = base64.b64encode(hmac.new(key=bytes(apiSecret, 'utf-8'), msg=msgStr, digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')

    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey, sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType

    # Submit the request / get a response
    uri = "https://" + host + path
    print(uri)
    print(headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)

    # Check to make sure the request was ok
    if response.status_code != 200:
        print('Request response went bad. Got back a %s code, meaning the request was %s' % (response.status_code, response.reason))
    else:
        # Use the downloaded data
        jsonData = response.json()
        #print(json.dumps(jsonData, indent=4))
        return jsonData
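The signature covers a newline-joined message in a fixed field order. The signing step in isolation, as a standalone sketch of what do_request builds:

def sign_canvas_data(method, host, content_type, path, api_time, api_secret):
    # Field order matters: method, host, content type, (blank), path,
    # (blank query string), date, then the secret itself.
    msg = "\n".join([method, host, content_type, '', path, '', api_time, api_secret])
    digest = hmac.new(bytes(api_secret, 'utf-8'),
                      msg=bytes(msg, 'utf-8'),
                      digestmod=hashlib.sha256).digest()
    return base64.b64encode(digest).decode('utf-8')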
def file_doesnt_exist(name):
    # Get list of files in current directory
    files = os.listdir()
@@ -8,7 +8,7 @@ from datetime import datetime as dt
from datetime import timedelta
from dateutil.parser import parse
from os.path import exists, getmtime
from pipelines import sync_non_interactive, url, header
from pipelines import url, header
import util
from semesters import short_to_sis

@@ -1121,7 +1121,7 @@ def full_reload():
    except Exception as e:
        print("Couldn't rename file:", str(e))

    sync_non_interactive()
    #sync_non_interactive()

    setup_table('requests_sum1')
    setup_table('courses')
pipelines.py (511 lines changed)

@@ -3,7 +3,7 @@ import codecs, json, requests, re, csv, datetime, os, jsondiff, os.path
import sys, shutil, hmac, hashlib, base64, schedule, time, pathlib
from datetime import timedelta

from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media, g_id, g_secret
from canvas_secrets import apiKey, apiSecret, FTP_SITE, FTP_USER, FTP_PW, url, domain, account_id, header, header_media
from canvas_secrets import instructure_url, instructure_username, instructure_private_key

import os, asyncio
@@ -571,95 +571,6 @@ def fetch_current_rosters_auto(poll_seconds=15):
        time.sleep(max(5, int(poll_seconds)))


# Canvas data, download all new files
def sync_non_interactive():
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)

        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
        i += 1
    print("I will attempt to download %i files." % len(wanted))

    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return

    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if(response.status_code != 200):
            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
                (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1
        else:
            #Use the downloaded data
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))


## OLD STYLE CANVAS DATA

# Get something from Canvas Data
def do_request(path):
    #Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'

    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)

    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')

    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')

    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType


    #Submit the request/get a response
    uri = "https://"+host+path
    print (uri)
    print (headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)

    #Check to make sure the request was ok
    if(response.status_code != 200):
        print(('Request response went bad. Got back a ', response.status_code, ' code, meaning the request was ', response.reason))
    else:
        #Use the downloaded data
        jsonData = response.json()
        #print(json.dumps(jsonData, indent=4))
        return jsonData
@@ -693,425 +604,7 @@ def put_file(remotepath,localpath, localfile,prompt=1):
    sftp.close()


"""
# copy files and directories from local static, to remote static,
# preserving modification times on the files
for f in localf:
    print("This local file: " + f + " ", end=' ')
    if not f in files:
        sftp.put('video_srt/'+classfoldername+'/'+f, f, preserve_mtime=True)
        print("Uploaded.")
    else:
        print("Skipped.")
"""

"""if len(files)==3 and 'users.csv' in files:
    sftp.get('courses.csv','rosters/courses-'+folder+'.csv')
    sftp.get('users.csv','rosters/users-'+folder+'.csv')
    sftp.get('enrollments.csv','rosters/enrollments-'+folder+'.csv')
    print folder + '\tSaved three data files in rosters folder.'

    courses = open('rosters/courses-'+folder+'.csv','r')
    courses.readline()
    a = courses.readline()
    print a
    courses.close()
    parts = a.split(',')
    year = parts[1][0:4]
    ss = parts[1][4:6]
    #print parts[1]
    sem = {'30':'spring', '50':'summer', '70':'fall' }
    this_sem = sem[ss]
    #print this_sem, "", year
    print folder + '\tbuilding data file...'
    convert_roster_files(this_sem,year,folder)
    print folder + '\tmoving files...'
    move_to_folder(this_sem,year,folder)
else:
    print folder + "\tDon't see all three files."""
################
################ GOOGLE DOCS
################
################
################

def sec(t): return "<h3>"+t+"</h3>\n"
def para(t): return "<p>"+t+"</p>\n"
def ul(t): return "<ul>"+t+"</ul>\n"
def li(t): return "<li>"+t+"</li>\n"

def question(t,bracket=1):
    ret = ''
    match = re.search( r'\[(.*)\]', t)
    if match and bracket:
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub( r'\[.*\]','',t)
    else:
        parts = t.split(' ')
        id = ''
        for p in parts:
            if re.search(r'[a-zA-Z]',p[0]): id += p[0]
        ret += "<a name='%s'></a>" % id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'

def answer(t):
    return t + '</div></div>\n'
def read_paragraph_element(element,type="NORMAL_TEXT"):
    """Returns the text in the given ParagraphElement.

    Args:
        element: a ParagraphElement from a Google Doc.
    """
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run:
        return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'

    content = text_run.get('content')
    content = re.sub(u'\u000b','<br />\n',content)

    return begin + content + end
def get_doc(docid, bracket=1, verbose=0):
    import pickle
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request

    #ooout = open(fileout,'w')

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)

    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)

    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)

    doc_lists = document.get('lists')

    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    in_a_list = ''

    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}

    if doc_objects:
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1

                            img_lookup[k] = name

                            with open('cache/doc_images/'+name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])

    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")

    tempout.write('- - - - - - - -\n\n')
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''

            if 'bullet' in value['paragraph']:
                # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.

                lid = value['paragraph']['bullet']['listId']

                if not list_stack: # 1
                    list_stack.append(lid)
                else:
                    if lid == list_stack[0]: # 2
                        pass
                    else:
                        if not lid in list_stack: # 3
                            list_stack.append(lid)
                        else: # 4
                            x = list_stack.pop()
                            while x != lid: list_stack.pop()
            elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
                list_stack = []

            list_depth = len(list_stack)

            deeper = list_depth - last_list_depth

            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                deeper = -1 * deeper
                answer_text += "</ul>" * deeper

            if len(list_stack):
                tag_fxn = li

            elements = value.get('paragraph').get('elements')

            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",

            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            for elem in elements:

                # text content
                this_text += read_paragraph_element(elem,type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)

            if last_type=='NORMAL_TEXT' and type!=last_type:
                text += answer(answer_text)
                answer_text = ''

            if type=='HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type=='HEADING_3':
                text += question(this_text,bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            # The text in table cells is in nested Structural Elements, and tables may be nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_structural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_structural_elements(toc.get('content'))

        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))

    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text
######### TRY #2 ######


def read_paragraph_element_2(element,type="NORMAL_TEXT"):
    text_run = element.get('textRun')
    begin = ''
    end = ''
    if not text_run: return ''
    if 'textStyle' in text_run and 'link' in text_run['textStyle']:
        begin = '<a href="' + text_run['textStyle']['link']['url'] + '">'
        end = '</a>'
    if 'textStyle' in text_run and 'bold' in text_run['textStyle'] and text_run['textStyle']['bold']==True and type=="NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'
    elif 'textStyle' in text_run and 'italic' in text_run['textStyle'] and text_run['textStyle']['italic']==True and type=="NORMAL_TEXT":
        begin = '<em>' + begin
        end = end + '</em>'
    content = text_run.get('content')
    content = re.sub(u'\u000b','<br />\n',content)
    return begin + content + end
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
    text = t[7:].strip()
    parts = text.split(", ")
    return ('icons',parts)

# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
    text = t[6:].strip()
    parts = text.split(", ")
    return ('tags',parts)

def handle_question(t,bracket=1):
    anchor = ''
    match = re.search( r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub( r'\[.*\]','',t)
    else:
        parts = t.split(' ')
        for p in parts:
            if re.search(r'[a-zA-Z]',p[0]): anchor += p[0].lower()
    return ('question', t, anchor)

def handle_answer(t):
    return ('answer',t)

def handle_sec(t): return ('section',t)
def handle_para(t): return ('paragraph',t)
def handle_ul(t): return ('unorderdedlist',t)
def handle_li(t): return ('listitem',t)
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}


def fetch_doc_image(k,value):
    global img_count, img_lookup, img_heights, img_widths
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    print(k)
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                    img_count += 1
                    img_lookup[k] = name

                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    print(uu)
                    print(response.headers)
                    print(name)
                    del response
            if 'size' in value['inlineObjectProperties']['embeddedObject']:
                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
def get_doc_generic(docid, bracket=1, verbose=0):
    import pickle
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    global img_count, img_lookup, img_heights, img_widths

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()

    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
    tempout.write( json.dumps(document,indent=2) \
        + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))

    doc_content = document.get('body').get('content')
    doc_objects = document.get('inlineObjects')
    doc_lists = document.get('lists')

    #text = ''
    #text =
    result = []
    last_type = ''
    #answer_text = ''
search.py (23 lines changed)

@@ -554,3 +554,26 @@ if __name__ == "__main__":

    # Call the function in the options dict
    options[ int(resp)][1]()

def try_clustering(df):
    from sklearn.cluster import KMeans
    # Drop the non-numeric course code column, then cluster the remaining features.
    df = df.drop(['code'], axis=1)
    kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
    return kmeans.labels_
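A usage sketch; the column names are hypothetical stand-ins for the numeric section features this is meant to receive:

import pandas as pd

df = pd.DataFrame({'code': ['A1', 'B2', 'C3', 'D4'],      # dropped inside
                   'fill_pct': [0.9, 0.2, 0.85, 0.1],
                   'online':   [1, 0, 1, 0]})
labels = try_clustering(df)   # one of four cluster ids per row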
def nlp_sample():
    from gensim import utils, corpora
    from nltk import stem
    stemmer = stem.porter.PorterStemmer()
    strings = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = [[stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
    dictionary = corpora.Dictionary(processed)
    return dictionary
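A plausible next step with the returned Dictionary (not part of the commit) is mapping new text onto the learned vocabulary as (token_id, count) pairs:

from gensim import utils
from nltk import stem

stemmer = stem.porter.PorterStemmer()
dictionary = nlp_sample()
# Preprocess and stem the query the same way the corpus was prepared.
tokens = [stemmer.stem(t) for t in utils.simple_preprocess("Human computer interface", min_len=4)]
print(dictionary.doc2bow(tokens))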
users.py (8 lines changed)

@@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
    print(json.dumps(g2, indent=2))


def nlp_sample():
## moved: nlp_sample now in search.py
# def nlp_sample():
    # Stream a training corpus directly from S3.
    #corpus = corpora.MmCorpus("s3://path/to/corpus")

@@ -1955,9 +1956,7 @@ def nlp_sample():
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings]
    print(processed)
    dictionary = corpora.Dictionary( processed )
    # moved
    dct = dictionary
    print(dictionary)

@@ -2980,4 +2979,3 @@ if __name__ == "__main__":

    # Call the function in the options dict
    options[ int(resp)][1]()