This commit is contained in:
Peter Howell 2025-09-05 23:06:34 +00:00
parent ff5ed654eb
commit 9584f45f30
7 changed files with 1472 additions and 1556 deletions

View File

@ -1528,6 +1528,334 @@ LANE: HyFlex
################
################ GOOGLE DOCS HELPERS (moved from pipelines)
################
def sec(t):
    """Wrap *t* as an <h3> section heading, newline-terminated."""
    return f"<h3>{t}</h3>\n"
def para(t):
    """Wrap *t* as a <p> paragraph, newline-terminated."""
    return f"<p>{t}</p>\n"
def ul(t):
    """Wrap *t* as a <ul> list, newline-terminated."""
    return f"<ul>{t}</ul>\n"
def li(t):
    """Wrap *t* as a <li> list item, newline-terminated."""
    return f"<li>{t}</li>\n"
def question(t, bracket=1):
    """Render *t* as an accordion question header (HTML string).

    When *bracket* is truthy and *t* contains an explicit "[anchor]" tag,
    that tag becomes the <a name=...> anchor and is stripped from the
    display text; otherwise the anchor is built from the lower-cased
    initials of the words of *t*.

    Fix: initials are taken only from non-empty word fragments, so
    consecutive spaces in *t* no longer raise IndexError on p[0].
    """
    ret = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        ret += "<a name='" + match.group(1) + "'></a>"
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        anchor_id = ''  # renamed from `id` (shadowed the builtin)
        for p in parts:
            # Skip empty fragments produced by consecutive spaces.
            if p and re.search(r'[a-zA-Z]', p[0]):
                anchor_id += p[0]
        ret += "<a name='%s'></a>" % anchor_id.lower()
    return ret + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
def answer(t):
    """Close the accordion content and container opened by question()."""
    return '%s</div></div>\n' % t
def read_paragraph_element(element, type="NORMAL_TEXT"):
    """Render one Docs paragraph element's textRun as an HTML fragment.

    Linked runs are wrapped in <a href=...>; bold runs are wrapped in
    <strong>, but only for NORMAL_TEXT paragraphs. Vertical-tab characters
    (Docs soft line breaks) become "<br />\\n". Elements without a textRun
    render as the empty string.
    """
    text_run = element.get('textRun')
    if not text_run:
        return ''
    prefix, suffix = '', ''
    style = text_run.get('textStyle', {})
    if 'link' in style:
        prefix = '<a href="' + style['link']['url'] + '">'
        suffix = '</a>'
    if style.get('bold') == True and type == "NORMAL_TEXT":
        prefix = '<strong>' + prefix
        suffix = suffix + '</strong>'
    body = re.sub(u'\u000b', '<br />\n', text_run.get('content'))
    return prefix + body + suffix
def read_paragraph_element_2(element, type="NORMAL_TEXT"):
    """Alias for read_paragraph_element(); kept for older call sites."""
    return read_paragraph_element(element, type)
# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
    """Parse a line of the form "Icons: a, b, c" into ('icons', ['a','b','c'])."""
    items = t[len("Icons: "):].strip().split(", ")
    return ('icons', items)
# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
    """Parse a line of the form "Tags: a, b, c" into ('tags', ['a','b','c'])."""
    items = t[len("Tags: "):].strip().split(", ")
    return ('tags', items)
def handle_question(t, bracket=1):
    """Parse a question line into the tuple ('question', text, anchor).

    When *bracket* is truthy and *t* contains an explicit "[anchor]" tag,
    the anchor is that tag lower-cased and the tag is stripped from the
    text; otherwise the anchor is the lower-cased initials of the words
    of *t*.

    Fix: initials are taken only from non-empty word fragments, so
    consecutive spaces in *t* no longer raise IndexError on p[0].
    """
    anchor = ''
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub(r'\[.*\]', '', t)
    else:
        parts = t.split(' ')
        for p in parts:
            # Skip empty fragments produced by consecutive spaces.
            if p and re.search(r'[a-zA-Z]', p[0]):
                anchor += p[0].lower()
    return ('question', t, anchor)
def handle_answer(t):
    """Tag raw answer text for the document assembler."""
    return ('answer', t)
def handle_sec(t):
    """Tag *t* as a section heading."""
    return ('section', t)
def handle_para(t):
    """Tag *t* as a paragraph."""
    return ('paragraph', t)
def handle_ul(t):
    """Tag *t* as an unordered list.

    The 'unorderdedlist' key [sic] is kept byte-for-byte: downstream
    consumers match on this exact (misspelled) string.
    """
    return ('unorderdedlist', t)
def handle_li(t):
    """Tag *t* as a list item."""
    return ('listitem', t)
# Module-level image bookkeeping, apparently retained for the commented-out
# global-state fetch_doc_image below; get_doc() keeps its own local copies
# of these names — TODO confirm no other module still reads these globals.
img_count = 1    # next sequential image file number
img_lookup = {}  # inlineObjectId -> cached image filename
img_heights = {} # inlineObjectId -> pixel height
img_widths = {}  # inlineObjectId -> pixel width
# Dead code: an earlier global-state fetch_doc_image, kept only as an
# unassigned string literal (it never executes). Superseded by the
# fetch_doc_image def that follows — candidate for deletion.
'''def fetch_doc_image(k,value):
global img_count, img_lookup, img_heights, img_widths
if 'inlineObjectProperties' in value:
if 'embeddedObject' in value['inlineObjectProperties']:
if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
print(k)
uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
response = requests.get(uu, stream=True)
name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
img_count += 1
img_lookup[k] = name
with open('cache/doc_images/'+name, 'wb') as out_file:
shutil.copyfileobj(response.raw, out_file)
print(uu)
print(response.headers)
print(name)
del response
if 'size' in value['inlineObjectProperties']['embeddedObject']:
img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
'''
def fetch_doc_image(k, value):
    """Download the inline image referenced by a Docs inlineObject entry.

    k:     the inlineObject id, used to build the cached file name.
    value: one entry of document['inlineObjects'].

    Saves the image to cache/doc_images/image_<k>.<ext> (extension taken
    from the response content-type) and returns True. Returns False when
    *value* carries no downloadable image — the original fell through and
    returned None here, so falsy-ness is preserved for any caller that
    truth-tests the result. The HTTP response is now closed explicitly.
    """
    import shutil
    embedded = value.get('inlineObjectProperties', {}).get('embeddedObject', {})
    uu = embedded.get('imageProperties', {}).get('contentUri')
    if not uu:
        return False
    response = requests.get(uu, stream=True)
    try:
        # e.g. content-type "image/png" -> extension "png"
        name = 'image_' + str(k) + '.' + response.headers['content-type'].split('/')[1]
        with open('cache/doc_images/' + name, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    finally:
        response.close()
    return True
def get_doc(docid, bracket=1, verbose=0):
    """Fetch a Google Doc and render it as accordion-style HTML.

    docid:   Google Docs document id.
    bracket: forwarded to question(); when truthy, a "[tag]" inside a
             HEADING_3 paragraph supplies the anchor name.
    verbose: when truthy, print intermediate API payloads.

    Returns the assembled HTML string. Side effects: caches OAuth creds in
    token.pickle, dumps raw API payloads to cache/trash/gdoctemp.txt, and
    downloads inline images to cache/doc_images/. Relies on module-level
    codecs/json/requests and the helpers sec/question/answer/para/li/
    read_paragraph_element.
    """
    import pickle, shutil
    import os.path
    from googleapiclient.discovery import build
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    #ooout = open(fileout,'w')
    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    service = build('docs', 'v1', credentials=creds)
    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)
    # Raw dump of everything received, for offline debugging.
    tempout = codecs.open('cache/trash/gdoctemp.txt','w','utf-8')
    tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
    if verbose: print('The title of the document is: {}'.format(document.get('title')))
    doc_content = document.get('body').get('content')
    if verbose: print(doc_content)
    doc_objects = document.get('inlineObjects')
    if verbose: print(doc_objects)
    doc_lists = document.get('lists')  # NOTE(review): unused except the commented loop below
    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    in_a_list = ''  # NOTE(review): never read; list state is tracked via list_stack
    # Local image bookkeeping (shadows the module-level names of the same
    # purpose): inlineObjectId -> cached filename / pixel dimensions.
    img_count = 1
    img_lookup = {}
    img_heights = {}
    img_widths = {}
    if doc_objects:
        # First pass: download every inline image so the paragraph pass can
        # reference it by inlineObjectId.
        for k,value in doc_objects.items():
            tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
            if 'inlineObjectProperties' in value:
                if 'embeddedObject' in value['inlineObjectProperties']:
                    if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                        if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                            print(k)
                            uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                            response = requests.get(uu, stream=True)
                            # Extension taken from the content-type (image/png -> png).
                            name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                            img_count += 1
                            img_lookup[k] = name
                            with open('cache/doc_images/'+name, 'wb') as out_file:
                                shutil.copyfileobj(response.raw, out_file)
                            print(uu)
                            print(response.headers)
                            print(name)
                            #input('x?')
                            del response
                    # NOTE(review): size is recorded even when no contentUri was
                    # downloaded for this object — confirm that is intentional.
                    if 'size' in value['inlineObjectProperties']['embeddedObject']:
                        img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                        img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
    tempout.write('- - - - - - - -\n\n')
    #for value in doc_lists:
    #    tempout.write( json.dumps(value,indent=2) + "\n\n\n--\n\n")
    tempout.write('- - - - - - - -\n\n')
    # Nested-list state: a stack of Docs listIds, one entry per open <ul>.
    list_stack = []
    list_depth = 0
    last_list_depth = 0
    # Second pass: walk the structural elements and emit HTML.
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))
        # todo: x link, x bold, list, image.
        tag_fxn = para
        if 'paragraph' in value:
            this_text = ''
            if 'bullet' in value['paragraph']:
                # either we're (1)starting a new list, (2)in one, (3)starting a nested one, or (4)finished a nested one.
                lid = value['paragraph']['bullet']['listId']
                if not list_stack: # 1
                    list_stack.append(lid)
                else:
                    if lid == list_stack[0]: # 2
                        pass
                    else:
                        if not lid in list_stack: # 3
                            list_stack.append(lid)
                        else: # 4
                            x = list_stack.pop()
                            while x != lid: list_stack.pop()
            elif len(list_stack) > 0: # current para isn't a bullet but we still have a list open.
                list_stack = []
            # Open/close <ul> tags to match the change in nesting depth.
            list_depth = len(list_stack)
            deeper = list_depth - last_list_depth
            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                deeper = -1 * deeper
                answer_text += "</ul>" * deeper
            if len(list_stack):
                tag_fxn = li
            elements = value.get('paragraph').get('elements')
            # inlineObjectElement": {
            #     "inlineObjectId": "kix.ssseeu8j9cfx",
            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                #text += json.dumps(style, sort_keys=True, indent=4)
                if 'namedStyleType' in style:
                    type = style['namedStyleType']
            # NOTE(review): if the very first paragraph carries no
            # namedStyleType, `type` is referenced below before assignment.
            for elem in elements:
                # text content
                this_text += read_paragraph_element(elem,type)
                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)
            # Leaving a run of normal text: close the open accordion answer.
            if last_type=='NORMAL_TEXT' and type!=last_type:
                text += answer(answer_text)
                answer_text = ''
            # HEADING_2 -> section title, HEADING_3 -> accordion question,
            # everything else accumulates into the current answer body.
            if type=='HEADING_2':
                text += sec(this_text)
                this_text = ''
            elif type=='HEADING_3':
                text += question(this_text,bracket)
                this_text = ''
            else:
                answer_text += tag_fxn(this_text)
                this_text = ''
            last_type = type
            last_list_depth = list_depth
        elif 'table' in value:
            # The text in table cells are in nested Structural Elements and tables may be
            # nested.
            text += "\nTABLE\n"
            #table = value.get('table')
            #for row in table.get('tableRows'):
            #    cells = row.get('tableCells')
            #    for cell in cells:
            #        text += read_strucutural_elements(cell.get('content'))
        #elif 'tableOfContents' in value:
        #    # The text in the TOC is also in a Structural Element.
        #    toc = value.get('tableOfContents')
        #    text += read_strucutural_elements(toc.get('content'))
        #else:
        #    print(json.dumps(value, sort_keys=True, indent=4))
    # Flush whatever answer text is still open, then return.
    text += answer(answer_text)
    #text += '</div>'
    #print(text)
    return text
def get_doc_generic(docid, bracket=1, verbose=0):
    """Alias for get_doc(); kept for callers using the generic name."""
    return get_doc(docid, bracket=bracket, verbose=verbose)
@ -1567,4 +1895,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()

View File

@ -2690,24 +2690,7 @@ def enrollment_helper():
# fill percentage for each section, then by mode, tod, campus # fill percentage for each section, then by mode, tod, campus
def try_clustering(df): ## moved: try_clustering now in search.py
# Import required libraries
from sklearn.cluster import KMeans
# Preprocessing
# Assuming df is your DataFrame and "modes" is your categorical column
#df['code'] = df['code'].astype('category').cat.codes
# Removing any other unnecessary columns
df = df.drop(['code'], axis=1)
# Perform KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=0).fit(df)
# Get the cluster labels
labels = kmeans.labels_
# Add labels to the DataFrame # Add labels to the DataFrame
#df['clusters'] = labels #df['clusters'] = labels
#print(df) #print(df)

View File

@ -4,6 +4,98 @@
# from pipelines - canvas data # from pipelines - canvas data
# Canvas data, download all new files
def sync_non_interactive():
    """Download every Canvas Data file not already present locally.

    Non-interactive variant: the confirmation prompt is commented out, so
    all missing files are fetched immediately from their signed URLs into
    local_data_folder. Prints a per-file Yes/No inventory and a final
    success/failure tally. Relies on module globals: do_request, mylog,
    local_data_folder, os, json, requests.
    """
    resp = do_request('/api/account/self/file/sync')
    mylog.write(json.dumps(resp, indent=4))
    #mylog.close()
    # Compare the remote file list against what is already on disk.
    gotten = os.listdir(local_data_folder)
    wanted = []
    i = 0
    for x in resp['files']:
        filename = x['filename']
        exi = "No "
        if filename in gotten: exi = "Yes"
        else: wanted.append(x)
        print(str(i) + '.\tLocal: %s\tRemote: %s' % ( exi, filename ))
        i += 1
    print("I will attempt to download %i files." % len(wanted))
    #answer = input("Press enter to begin, or q to quit ")
    #if not answer == '': return
    good_count = 0
    bad_count = 0
    for W in wanted:
        print("Downloading: " + W['filename'])
        response = requests.request(method='GET', url=W['url'], stream=True)
        if(response.status_code != 200):
            print('Request response went bad. Got back a %s code, meaning the request was %s' % \
                (response.status_code, response.reason))
            print('URL: ' + W['url'])
            bad_count += 1
        else:
            #Use the downloaded data
            # Stream to disk in chunks rather than buffering the whole file.
            with open(local_data_folder + W['filename'], 'wb') as fd:
                for chunk in response.iter_content(chunk_size=128):
                    fd.write(chunk)
            print("Success")
            good_count += 1
    print("Out of %i files, %i succeeded and %i failed." % (len(wanted),good_count,bad_count))
## OLD STYLE CANVAS DATA
# Get something from Canvas Data
def do_request(path):
    """Issue an HMAC-signed GET against the legacy Canvas Data API.

    path: request path beginning with '/', e.g. '/api/account/self/file/sync'.

    Returns the parsed JSON response body, or None when the response is not
    200 (a diagnostic is printed). Relies on module globals apiKey and
    apiSecret for signing.

    Fix: the failure branch previously printed a tuple (a leftover
    Python-2-style print); it now prints a formatted message, and the None
    return on failure is explicit instead of an implicit fall-through.
    """
    #Set up the request pieces
    method = 'GET'
    host = 'api.inshosteddata.com'
    apiTime = datetime.utcnow().strftime('%a, %d %b %y %H:%M:%S GMT')
    apiContentType = 'application/json'
    # The API's HMAC scheme signs exactly this newline-joined field sequence.
    msgList = []
    msgList.append(method)
    msgList.append(host)
    msgList.append(apiContentType)
    msgList.append('')
    msgList.append(path)
    msgList.append('')
    msgList.append(apiTime)
    msgList.append(apiSecret)
    msgStr = bytes("".join("%s\n" % k for k in msgList).strip(),'utf-8')
    sig = base64.b64encode(hmac.new(key=bytes(apiSecret,'utf-8'),msg=msgStr,digestmod=hashlib.sha256).digest())
    sig = sig.decode('utf-8')
    headers = {}
    headers['Authorization'] = 'HMACAuth {}:{}'.format(apiKey,sig)
    headers['Date'] = apiTime
    headers['Content-type'] = apiContentType
    #Submit the request/get a response
    uri = "https://"+host+path
    print (uri)
    print (headers)
    response = requests.request(method='GET', url=uri, headers=headers, stream=True)
    #Check to make sure the request was ok
    if(response.status_code != 200):
        print('Request response went bad. Got back a %s code, meaning the request was %s' %
              (response.status_code, response.reason))
        return None
    #Use the downloaded data
    jsonData = response.json()
    #print(json.dumps(jsonData, indent=4))
    return jsonData
def file_doesnt_exist(name): def file_doesnt_exist(name):
# Get list of files in current directory # Get list of files in current directory
files = os.listdir() files = os.listdir()

View File

@ -8,7 +8,7 @@ from datetime import datetime as dt
from datetime import timedelta from datetime import timedelta
from dateutil.parser import parse from dateutil.parser import parse
from os.path import exists, getmtime from os.path import exists, getmtime
from pipelines import sync_non_interactive, url, header from pipelines import url, header
import util import util
from semesters import short_to_sis from semesters import short_to_sis
@ -1121,7 +1121,7 @@ def full_reload():
except Exception as e: except Exception as e:
print("Couldn't rename file:", str(e)) print("Couldn't rename file:", str(e))
sync_non_interactive() #sync_non_interactive()
setup_table('requests_sum1') setup_table('requests_sum1')
setup_table('courses') setup_table('courses')

File diff suppressed because it is too large Load Diff

View File

@ -554,3 +554,26 @@ if __name__ == "__main__":
# Call the function in the options dict # Call the function in the options dict
options[ int(resp)][1]() options[ int(resp)][1]()
def try_clustering(df):
    """Cluster the rows of *df* (excluding its 'code' column) into 4 groups.

    Returns the fitted KMeans label array, one integer label per row.
    The 'code' column is dropped before fitting; random_state is pinned
    for reproducibility.
    """
    from sklearn.cluster import KMeans
    features = df.drop(['code'], axis=1)
    model = KMeans(n_clusters=4, random_state=0)
    model.fit(features)
    return model.labels_
def nlp_sample():
    """Build a gensim Dictionary from a small stemmed demo corpus.

    Tokenizes nine sample sentences (minimum token length 4), applies
    Porter stemming, and returns the resulting corpora.Dictionary.
    """
    from gensim import utils, corpora
    from nltk import stem
    stemmer = stem.porter.PorterStemmer()
    strings = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    processed = []
    for sentence in strings:
        tokens = utils.simple_preprocess(sentence, min_len=4)
        processed.append([stemmer.stem(tok) for tok in tokens])
    return corpora.Dictionary(processed)

View File

@ -1938,7 +1938,8 @@ def track_users_by_teacherclass():
print(json.dumps(g2, indent=2)) print(json.dumps(g2, indent=2))
def nlp_sample(): ## moved: nlp_sample now in search.py
# def nlp_sample():
# Stream a training corpus directly from S3. # Stream a training corpus directly from S3.
#corpus = corpora.MmCorpus("s3://path/to/corpus") #corpus = corpora.MmCorpus("s3://path/to/corpus")
@ -1955,9 +1956,7 @@ def nlp_sample():
"Graph minors IV Widths of trees and well quasi ordering", "Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey", "Graph minors A survey",
] ]
processed = [ [ stemmer.stem(y) for y in utils.simple_preprocess(x, min_len=4)] for x in strings] # moved
print(processed)
dictionary = corpora.Dictionary( processed )
dct = dictionary dct = dictionary
print(dictionary) print(dictionary)
@ -2980,4 +2979,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()