Moving functions to deprecated; cleaning up

Coding with Peter 2023-04-11 08:56:24 -07:00
parent 035db48464
commit 0c358da2ab
3 changed files with 239 additions and 115 deletions

View File

@@ -3,7 +3,7 @@
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
import requests, codecs, os, re, json
from pipelines import header, fetch, url, put_file
-from util import clean_title, to_file_friendly, minimal_string
+from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
import tomd, checker
@@ -19,63 +19,9 @@ def d(s):
    global DBG
    if DBG: print(s)
def stripper(s):
    REMOVE_ATTRIBUTES = [
        'lang','language','onmouseover','onmouseout','script','style','font',
        'dir','face','size','color','style','class','width','height','hspace',
        'border','valign','align','background','bgcolor','text','link','vlink',
        'alink','cellpadding','cellspacing']
    #doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is <i>paragraph</i> <a onmouseout="">one</a>.<p id="secondpara" align="blah">This is <i>paragraph</i> <b>two</b>.</html>'''
    soup = bs(s, features='lxml')
    for tag in soup.recursiveChildGenerator():
        try:
            tag.attrs = {key:value for key,value in tag.attrs.iteritems()
                         if key not in REMOVE_ATTRIBUTES}
        except AttributeError:
            # 'NavigableString' object has no attribute 'attrs'
            pass
    return soup.prettify()

def mycleaner(s):
    s = re.sub(r'<br\s?\/>','\n',s)
    s = re.sub(r'<\/?b>','',s)
    s = re.sub(r' +',' ',s)
    s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
    s = re.sub('^ ','',s)
    return s
def freshdesk():
    path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
    soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
    outpt = codecs.open('cache/faqs.txt','w')
    out = ""
    for a in soup.find_all('solution-article'):
        print("TITLE\n"+a.find('title').get_text())
        out += a.find('title').get_text()
        """for d in a.find_all('description'):
            #print(d)
            if d:
                d = h.unescape(d.get_text())
                e = stripper(d)
                m = tomd.convert( e )
                m = mycleaner(m)
                print("\nDESCRIPTION\n"+m)"""
        #print("\nWHAT IS THIS?\n" +
        hh = a.find('desc-un-html').get_text()
        d = h.unescape(hh)
        e = stripper(d)
        m = tomd.convert( e )
        m = mycleaner(m)
        print("\nDESCRIPTION\n"+m)
        out += "\n\n" + m + "\n\n"
        print("-----------\n\n")
    outpt.write(out)
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
@@ -533,64 +479,6 @@ def grab_course_pages(course_num=-1):
    pageout.close()
    pageoutm.close()
# Upload pages. Local copy has a particular format.
# Appears to not be used
def put_course_pages():
    course_num = '6862'
    filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search('^###\s(.*)###\s(.*)$',L)
        if ma:
            my_titles.append(ma.group(1))
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1

# Also not used
def put_revised_pages():
    course_num = '6862'
    course_folder = '../course_temps/course_6862'
    filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search('^<h1>(.*)</h1>.*$',L)
        if ma:
            my_titles.append(ma.group(1))
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1
# Download, clean html, and reupload page
def update_page():
    global results, results_dict, url, header
@@ -837,6 +725,115 @@ def multiple_downloads():
    accessible_check(id)
###
###
### Text / Knowledge Base
###
### How about downloading all possible info / webpages / sources
### related to Gavilan and creating a master search index?
###
### Goals:
### - Scripted approach to allow re-indexing / updating
### - Break everything down into paragraphs
###
### - Script to extract keywords, topics, entities, summaries, questions answered
### from each paragraph or chunk.
### - Use spacy, gensim, nltk, or gpt-3, or a combination of all of them
###
### - Create vector / embeddings for each paragraph
###
### - Enable a vector search engine and connect to front page of gavilan.cc
### - Use that to feed a handful of source paragraphs (& prompt) into gpt and
###   receive text answers to questions (see the retrieval sketch after
###   demo_vector_search below)
def demo_vector_search():
    from gensim.models import Word2Vec
    from gensim.utils import simple_preprocess
    import nltk.data
    import spacy
    # (might have to upgrade pip first...)
    # pip install --upgrade click
    #
    # python -m spacy download en_core_web_sm
    # python -m spacy download en_core_web_md
    # python -m spacy download en_core_web_lg

    def is_complete_sentence(text):
        #text = text.text
        doc = nlp(text)
        sentences = list(doc.sents)
        if len(sentences) == 1 and text.strip() == sentences[0].text.strip():
            return True
        return False

    sentences = [
        "This is an example sentence.",
        "Here is another sentence for training."
    ]
    paragraph = """Financial Aid services are available in person! We are happy to assist you with your financial aid needs. If you are interested in visiting the office in person, please review the guidelines for visiting campus and schedule your appointment:
Guidelines for In-Person Financial Aid Services
Due to FERPA regulations, no student information will be given to anyone other than the student without authorization from the student.
We continue to offer virtual services. Financial Aid staff may be reached by email, phone, text, and zoom! Please refer to the contact information and schedules below.
Gavilan-WelcomeCenter_Peer_Mentors.jpg
Do you need assistance filing the FAFSA or California Dream Act Application? Friendly and knowledgeable Peer Mentors are available to assist you virtually and in person! Details below for an online Zoom visit, phone call, or in-person visit with Peer Mentors.
Monday - Friday 8am - 5pm, Student Center
Join Zoom to Connect with a Peer Mentor
Or call (669) 900-6833 and use meeting ID 408 848 4800
MicrosoftTeams-image.png
Do you need assistance with an existing financial aid application, financial aid document submission, or review of your financial aid package? Schedule an in-person, phone, or zoom appointment with our Financial Aid counter.
Mon - Thurs: 9am - 1:00pm, 2:00pm - 5:00pm
Fri: 10am - 2pm
Office: (408) 848-4727 Email: finaid@gavilan.edu
Schedule an In-Person, Phone or Zoom Appointment"""

    # sentence splitting, attempt 1: nltk punkt
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences1 = tokenizer.tokenize(paragraph)
    for i,s in enumerate(sentences1):
        print(i, "\t", s)
    print("\n\n")

    # sentence splitting, attempt 2: spacy
    #nlp = spacy.load('en_core_web_sm')
    nlp = spacy.load('en_core_web_md')
    doc = nlp(paragraph)
    sentences2 = list(doc.sents)
    for i,s in enumerate(sentences2):
        t = re.sub(r'\n+',' ',s.text)
        is_sentence = 'yes' if is_complete_sentence(t) else 'no '
        print(i, " ", is_sentence, " ", t)
    print("\n\n")
    #for text in sentences2:
    #    print(text, "is a complete sentence?" , is_complete_sentence(text))

    # NOTE: the early return below means the Word2Vec demo underneath never runs
    return

    tokenized_sentences = [simple_preprocess(s) for s in sentences]
    model = Word2Vec(tokenized_sentences, min_count=1, vector_size=100)
    example_word = "example"
    vector = model.wv[example_word]
    print(f"Vector for the word '{example_word}': {vector}")
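
### Rough sketch toward the "vector search engine" goal above: embed each
### paragraph with the same spacy en_core_web_md vectors used in
### demo_vector_search, then rank paragraphs against a query by cosine
### similarity. Not wired into anything yet; the function name, the query,
### and the sample paragraphs are placeholders.
def demo_paragraph_retrieval():
    import numpy as np
    import spacy
    nlp = spacy.load('en_core_web_md')
    paragraphs = [
        "Financial Aid services are available in person at the Student Center.",
        "Peer Mentors can help you file the FAFSA or California Dream Act Application.",
        "Schedule an in-person, phone, or zoom appointment with the Financial Aid counter.",
    ]
    # one vector per paragraph (spacy averages token vectors over the Doc)
    vectors = [nlp(p).vector for p in paragraphs]

    def cosine(a, b):
        denom = (np.linalg.norm(a) * np.linalg.norm(b)) or 1.0
        return float(np.dot(a, b) / denom)

    query = "How do I get help paying for college?"
    qv = nlp(query).vector
    # best-matching paragraphs first; these would be the chunks handed to gpt
    ranked = sorted(zip(paragraphs, vectors), key=lambda pv: cosine(qv, pv[1]), reverse=True)
    for p, v in ranked:
        print(round(cosine(qv, v), 3), "\t", p)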
## TODO site scraper
## TODO find a package that extracts text from web pages
### TODO master list of what to index.
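
### Rough sketch for the scraper TODOs above: fetch one page and keep only the
### visible text, one candidate chunk per block-level element. requests, bs,
### and re are already imported in this file; the URL default and the tag list
### are placeholder choices, and a dedicated extraction package may still be
### the better answer.
def demo_page_text(u='https://www.gavilan.edu/'):
    r = requests.get(u)
    soup = bs(r.text, features='lxml')
    # drop script/style so their contents don't leak into the extracted text
    for t in soup(['script', 'style', 'noscript']):
        t.decompose()
    chunks = []
    for tag in soup.find_all(['p', 'li', 'h1', 'h2', 'h3']):
        text = re.sub(r'\s+', ' ', tag.get_text()).strip()
        if text:
            chunks.append(text)
    return chunks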
if __name__ == "__main__":
    print ('')
@@ -844,8 +841,9 @@ if __name__ == "__main__":
        2: ['download multiple classes', multiple_downloads ],
        3: ['convert stuff', pan_testing ],
        4: ['convert md to html', md_to_course ],
-       5: ['import freshdesk content', freshdesk ],
+       # 5: ['import freshdesk content', freshdesk ],
        6: ['download all a courses pages', grab_course_pages],
+       7: ['demo vector search', demo_vector_search],
    }
    for key in options:

View File

@@ -1617,8 +1617,11 @@ for L in str.split("\n"):
## sched.py
+import codecs
import requests, re, csv, json, funcy, sys
+from content import upload_page
def dates(s):
    #print(s)
@@ -1708,8 +1711,99 @@ if __name__ == "__main__":
    options[ int(resp)][1]()
def put_revised_pages():
    course_num = '6862'
    course_folder = '../course_temps/course_6862'
    filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search('^<h1>(.*)</h1>.*$',L)
        if ma:
            my_titles.append(ma.group(1))
            # NOTE: this pattern only has one group, so group(2) would raise an
            # error if a heading ever matches; kept as-is since the function is unused
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1

# Upload pages. Local copy has a particular format.
# Appears to not be used
def put_course_pages():
    course_num = '6862'
    filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search(r'^###\s(.*)###\s(.*)$',L)
        if ma:
            my_titles.append(ma.group(1))
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1

def freshdesk():
    # these names were available at module level in content.py before the move;
    # imported locally here so the function still runs from this file
    from bs4 import BeautifulSoup as bs
    from util import stripper, mycleaner
    import tomd
    import html as h   # assumption: h was an HTML-unescaping helper; html.unescape fills that role
    path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
    soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
    outpt = codecs.open('cache/faqs.txt','w')
    out = ""
    for a in soup.find_all('solution-article'):
        print("TITLE\n"+a.find('title').get_text())
        out += a.find('title').get_text()
        """for d in a.find_all('description'):
            #print(d)
            if d:
                d = h.unescape(d.get_text())
                e = stripper(d)
                m = tomd.convert( e )
                m = mycleaner(m)
                print("\nDESCRIPTION\n"+m)"""
        #print("\nWHAT IS THIS?\n" +
        hh = a.find('desc-un-html').get_text()
        d = h.unescape(hh)
        e = stripper(d)
        m = tomd.convert( e )
        m = mycleaner(m)
        print("\nDESCRIPTION\n"+m)
        out += "\n\n" + m + "\n\n"
        print("-----------\n\n")
    outpt.write(out)

util.py
View File

@@ -4,6 +4,38 @@
import re, csv
from collections import defaultdict
+from bs4 import BeautifulSoup as bs
def stripper(s):
    REMOVE_ATTRIBUTES = [
        'lang','language','onmouseover','onmouseout','script','style','font',
        'dir','face','size','color','style','class','width','height','hspace',
        'border','valign','align','background','bgcolor','text','link','vlink',
        'alink','cellpadding','cellspacing']
    #doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is <i>paragraph</i> <a onmouseout="">one</a>.<p id="secondpara" align="blah">This is <i>paragraph</i> <b>two</b>.</html>'''
    soup = bs(s, features='lxml')
    for tag in soup.recursiveChildGenerator():
        try:
            # dict.items(), not the Python 2 iteritems(), so the attribute filter actually runs
            tag.attrs = {key:value for key,value in tag.attrs.items()
                         if key not in REMOVE_ATTRIBUTES}
        except AttributeError:
            # 'NavigableString' object has no attribute 'attrs'
            pass
    return soup.prettify()

def mycleaner(s):
    s = re.sub(r'<br\s?\/>','\n',s)
    s = re.sub(r'<\/?b>','',s)
    s = re.sub(r' +',' ',s)
    s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
    s = re.sub('^ ','',s)
    return s
def print_table(table):
    longest_cols = [