From 0c358da2ab472ed4ceffe0f6f3d72c411fafce13 Mon Sep 17 00:00:00 2001
From: Coding with Peter
Date: Tue, 11 Apr 2023 08:56:24 -0700
Subject: [PATCH] moving fxns to depricated. cleaning

---
 content.py    | 226 +++++++++++++++++++++++++-------------------------
 depricated.py |  96 ++++++++++++++++++++-
 util.py       |  32 +++++++
 3 files changed, 239 insertions(+), 115 deletions(-)

diff --git a/content.py b/content.py
index 2fc4832..56307a6 100644
--- a/content.py
+++ b/content.py
@@ -3,7 +3,7 @@
 #saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
 import requests, codecs, os, re, json
 from pipelines import header, fetch, url, put_file
-from util import clean_title, to_file_friendly, minimal_string
+from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
 from bs4 import BeautifulSoup as bs
 from html.parser import HTMLParser
 import tomd, checker
@@ -19,63 +19,9 @@ def d(s):
     global DBG
     if DBG: print(s)
 
-def stripper(s):
-    REMOVE_ATTRIBUTES = [
-        'lang','language','onmouseover','onmouseout','script','style','font',
-        'dir','face','size','color','style','class','width','height','hspace',
-        'border','valign','align','background','bgcolor','text','link','vlink',
-        'alink','cellpadding','cellspacing']
-    #doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.<p id="secondpara" align="blah">This is paragraph <b>two</b>.'''
-    soup = bs(s, features='lxml')
-    for tag in soup.recursiveChildGenerator():
-        try:
-            tag.attrs = {key:value for key,value in tag.attrs.iteritems()
-                         if key not in REMOVE_ATTRIBUTES}
-        except AttributeError:
-            # 'NavigableString' object has no attribute 'attrs'
-            pass
-    return soup.prettify()
-def mycleaner(s):
-    s = re.sub(r'<br\s*\/?>','\n',s)
-    s = re.sub(r'<\/?b>','',s)
-    s = re.sub(r' +',' ',s)
-    s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
-    s = re.sub('^ ','',s)
-    return s
-def freshdesk():
-    path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
-    soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
-
-    outpt = codecs.open('cache/faqs.txt','w')
-    out = ""
-    for a in soup.find_all('solution-article'):
-
-        print("TITLE\n"+a.find('title').get_text())
-        out += a.find('title').get_text()
-
-        """for d in a.find_all('description'):
-            #print(d)
-            if d:
-                d = h.unescape(d.get_text())
-                e = stripper(d)
-                m = tomd.convert( e )
-                m = mycleaner(m)
-                print("\nDESCRIPTION\n"+m)"""
-
-        #print("\nWHAT IS THIS?\n" +
-        hh = a.find('desc-un-html').get_text()
-        d = h.unescape(hh)
-        e = stripper(d)
-        m = tomd.convert( e )
-        m = mycleaner(m)
-        print("\nDESCRIPTION\n"+m)
-        out += "\n\n" + m + "\n\n"
-
-        print("-----------\n\n")
-    outpt.write(out)
 
 # Download everything interesting in a course to a local folder
 # Build a master file with the entire class content
@@ -533,64 +479,6 @@ def grab_course_pages(course_num=-1):
     pageout.close()
     pageoutm.close()
 
-# Upload pages. Local copy has a particular format.
-# Appears to not be used
-def put_course_pages():
-    course_num = '6862'
-    filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
-    my_titles = []
-    my_urls = []
-    my_bodys = []
-    started = 0
-    current_body = ""
-    for L in filein.readlines():
-        ma = re.search('^###\s(.*)###\s(.*)$',L)
-        if ma:
-            my_titles.append(ma.group(1))
-            my_urls.append(ma.group(2))
-            if started:
-                my_bodys.append(current_body)
-                current_body = ""
-            started = 1
-        else:
-            current_body += "\n" + L
-    my_bodys.append(current_body)
-
-    i = 0
-    for U in my_urls:
-        # and now upload it....lol
-        upload_page(course_num,U,my_bodys[i])
-        i += 1
-
-# Also not used
-def put_revised_pages():
-    course_num = '6862'
-    course_folder = '../course_temps/course_6862'
-    filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
-    my_titles = []
-    my_urls = []
-    my_bodys = []
-    started = 0
-    current_body = ""
-    for L in filein.readlines():
-        ma = re.search('^(.*).*$',L)
-        if ma:
-            my_titles.append(ma.group(1))
-            my_urls.append(ma.group(2))
-            if started:
-                my_bodys.append(current_body)
-                current_body = ""
-            started = 1
-        else:
-            current_body += "\n" + L
-    my_bodys.append(current_body)
-
-    i = 0
-    for U in my_urls:
-        # and now upload it....lol
-        upload_page(course_num,U,my_bodys[i])
-        i += 1
-
 # Download, clean html, and reupload page
 def update_page():
     global results, results_dict, url, header
@@ -836,6 +724,115 @@ def multiple_downloads():
     for id in x.split(" "):
         accessible_check(id)
 
+
+###
+###
+### Text / Knowledge Base
+###
+### How about downloading all possible info / webpages / sources
+### related to Gavilan and creating a master search index?
+###
+### Goals:
+###  - Scripted approach to allow re-indexing / updating
+###  - Break everything down into paragraphs
+###
+###  - Script to extract keywords, topics, entities, summaries, questions answered
+###    from each paragraph or chunk.
+###  - Use spacy, gensim, nltk, or gpt-3, or a combination of all of them
+###
+###  - Create vector / embeddings for each paragraph
+###
+###  - Enable a vector search engine and connect to front page of gavilan.cc
+###  - Use that to feed handful of source paragraphs (& prompt) into gpt and
+###    receive text answers to questions.
+
+def demo_vector_search():
+    from gensim.models import Word2Vec
+    from gensim.utils import simple_preprocess
+    import nltk.data
+    import spacy
+
+    # (might have to upgrade pip first...)
+    # pip install --upgrade click
+    #
+    # python -m spacy download en_core_web_sm
+    # python -m spacy download en_core_web_md
+    # python -m spacy download en_core_web_lg
+
+    def is_complete_sentence(text):
+        #text = text.text
+        doc = nlp(text)
+        sentences = list(doc.sents)
+        if len(sentences) == 1 and text.strip() == sentences[0].text.strip():
+            return True
+        return False
+
+
+    sentences = [
+        "This is an example sentence.",
+        "Here is another sentence for training."
+    ]
+
+    paragraph = """Financial Aid services are available in person! We are happy to assist you with your financial aid needs. If you are interested in visiting the office in person, please review the guidelines for visiting campus and schedule your appointment:
+
+Guidelines for In-Person Financial Aid Services
+
+Due to FERPA regulations, no student information will be given to anyone other than the student without authorization from the student.
+We continue to offer virtual services. Financial Aid staff may be reached by email, phone, text, and zoom! Please refer to the contact information and schedules below.
+
+Gavilan-WelcomeCenter_Peer_Mentors.jpg
+
+Do you need assistance filing the FAFSA or California Dream Act Application? Friendly and knowledgeable Peer Mentors are available to assist you virtually and in person! Details below for an online Zoom visit, phone call, or in-person visit with Peer Mentors.
+
+Monday - Friday 8am - 5pm, Student Center
+Join Zoom to Connect with a Peer Mentor
+Or call (669) 900-6833 and use meeting ID 408 848 4800
+
+MicrosoftTeams-image.png
+
+
+
+Do you need assistance with an existing financial aid application, financial aid document submission, or review of your financial aid package? Schedule an in-person, phone, or zoom appointment with our Financial Aid counter.
+
+Mon - Thurs: 9am - 1:00pm, 2:00pm - 5:00pm
+Fri: 10am - 2pm
+Office: (408) 848-4727 Email: finaid@gavilan.edu
+Schedule an In-Person, Phone or Zoom Appointment"""
+
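+    # NOTE: NLTK's Punkt model may need a one-time download before the
+    # tokenizer below will load (e.g. `python -m nltk.downloader punkt`).
+    # The two loops that follow compare sentence segmentation of the same
+    # paragraph by NLTK's Punkt tokenizer and by spaCy's parser.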
+    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+    sentences1 = tokenizer.tokenize(paragraph)
+    for i,s in enumerate(sentences1):
+        print(i, "\t", s)
+    print("\n\n")
+
+    #nlp = spacy.load('en_core_web_sm')
+    nlp = spacy.load('en_core_web_md')
+
+    doc = nlp(paragraph)
+    sentences2 = list(doc.sents)
+    for i,s in enumerate(sentences2):
+        t = re.sub(r'\n+',' ',s.text)
+        is_sentence = 'yes' if is_complete_sentence(t) else 'no '
+        print(i, " ", is_sentence, " ", t)
+    print("\n\n")
+
+    #for text in sentences2:
+    #    print(text, "is a complete sentence?" , is_complete_sentence(text))
+
+    return   # early exit: the Word2Vec toy example below is currently skipped
+
+    tokenized_sentences = [simple_preprocess(s) for s in sentences]
+    model = Word2Vec(tokenized_sentences, min_count=1, vector_size=100)
+
+    example_word = "example"
+    vector = model.wv[example_word]
+    print(f"Vector for the word '{example_word}': {vector}")
+
+
+    ## TODO site scraper
+
+    ## TODO find package that extracts text from web page
+
+    ### TODO master list of what to index.
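+
+
+# Rough sketch (not wired into the menu) of the paragraph-level vector search
+# described in the notes above: embed each paragraph with spaCy's built-in
+# vectors and rank paragraphs by cosine similarity to a query.
+# demo_paragraph_search is a hypothetical helper, not part of the existing
+# code; `paragraphs` and `query` are placeholders, and a real index would be
+# built from the scraped sources and stored alongside their URLs.
+def demo_paragraph_search(paragraphs, query, top_n=3):
+    import spacy
+    nlp = spacy.load('en_core_web_md')      # md/lg models ship with word vectors
+    docs = [nlp(p) for p in paragraphs]     # one Doc (and one vector) per paragraph
+    q = nlp(query)
+    # Doc.similarity() is cosine similarity over the averaged word vectors
+    ranked = sorted(docs, key=lambda d: q.similarity(d), reverse=True)
+    best = [d.text for d in ranked[:top_n]]
+    # The winners could then be pasted into a gpt prompt along with the question.
+    return best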

 if __name__ == "__main__":
@@ -844,8 +841,9 @@ if __name__ == "__main__":
         2: ['download multiple classes', multiple_downloads ],
         3: ['convert stuff', pan_testing ],
         4: ['convert md to html', md_to_course ],
-        5: ['import freshdesk content', freshdesk ],
+        # 5: ['import freshdesk content', freshdesk ],
         6: ['download all a courses pages', grab_course_pages],
+        7: ['demo vector search', demo_vector_search],
     }
 
     for key in options:
diff --git a/depricated.py b/depricated.py
index 73e843d..d79baaf 100644
--- a/depricated.py
+++ b/depricated.py
@@ -1617,8 +1617,11 @@ for L in str.split("\n"):
 
 ## sched.py
 
+import codecs
 import requests, re, csv, json, funcy, sys
+from content import upload_page
+
 
 def dates(s):
     #print(s)
@@ -1705,11 +1708,102 @@ if __name__ == "__main__":
 
     resp = input('Choose: ')
 
     # Call the function in the options dict
-    options[ int(resp)][1]()
+    options[ int(resp)][1]()
+
+
+def put_revised_pages():
+    course_num = '6862'
+    course_folder = '../course_temps/course_6862'
+    filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
+    my_titles = []
+    my_urls = []
+    my_bodys = []
+    started = 0
+    current_body = ""
+    for L in filein.readlines():
+        ma = re.search('^(.*).*$',L)
+        if ma:
+            my_titles.append(ma.group(1))
+            my_urls.append(ma.group(2))
+            if started:
+                my_bodys.append(current_body)
+                current_body = ""
+            started = 1
+        else:
+            current_body += "\n" + L
+    my_bodys.append(current_body)
+
+    i = 0
+    for U in my_urls:
+        # and now upload it....lol
+        upload_page(course_num,U,my_bodys[i])
+        i += 1
+
+# Upload pages. Local copy has a particular format.
+# Appears to not be used
+
+def put_course_pages():
+    course_num = '6862'
+    filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
+    my_titles = []
+    my_urls = []
+    my_bodys = []
+    started = 0
+    current_body = ""
+    for L in filein.readlines():
+        ma = re.search('^###\s(.*)###\s(.*)$',L)
+        if ma:
+            my_titles.append(ma.group(1))
+            my_urls.append(ma.group(2))
+            if started:
+                my_bodys.append(current_body)
+                current_body = ""
+            started = 1
+        else:
+            current_body += "\n" + L
+    my_bodys.append(current_body)
+
+    i = 0
+    for U in my_urls:
+        # and now upload it....lol
+        upload_page(course_num,U,my_bodys[i])
+        i += 1
+
+def freshdesk():
+    path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
+    soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
+
+    outpt = codecs.open('cache/faqs.txt','w')
+    out = ""
+    for a in soup.find_all('solution-article'):
+
+        print("TITLE\n"+a.find('title').get_text())
+        out += a.find('title').get_text()
+
+        """for d in a.find_all('description'):
+            #print(d)
+            if d:
+                d = h.unescape(d.get_text())
+                e = stripper(d)
+                m = tomd.convert( e )
+                m = mycleaner(m)
+                print("\nDESCRIPTION\n"+m)"""
+
+        #print("\nWHAT IS THIS?\n" +
+        hh = a.find('desc-un-html').get_text()
+        d = h.unescape(hh)
+        e = stripper(d)
+        m = tomd.convert( e )
+        m = mycleaner(m)
+        print("\nDESCRIPTION\n"+m)
+        out += "\n\n" + m + "\n\n"
+
+        print("-----------\n\n")
+    outpt.write(out)
+
diff --git a/util.py b/util.py
index 1a87041..a1197bf 100644
--- a/util.py
+++ b/util.py
@@ -4,6 +4,38 @@
 import re, csv
 
 from collections import defaultdict
+from bs4 import BeautifulSoup as bs
+
+
+
+
+def stripper(s):
+    REMOVE_ATTRIBUTES = [
+        'lang','language','onmouseover','onmouseout','script','style','font',
+        'dir','face','size','color','style','class','width','height','hspace',
+        'border','valign','align','background','bgcolor','text','link','vlink',
+        'alink','cellpadding','cellspacing']
+
+    #doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.<p id="secondpara" align="blah">This is paragraph <b>two</b>.'''
+    soup = bs(s, features='lxml')
+    for tag in soup.recursiveChildGenerator():
+        try:
+            tag.attrs = {key:value for key,value in tag.attrs.items()
+                         if key not in REMOVE_ATTRIBUTES}
+        except AttributeError:
+            # 'NavigableString' object has no attribute 'attrs'
+            pass
+    return soup.prettify()
+
+def mycleaner(s):
+    s = re.sub(r'<br\s*\/?>','\n',s)
+    s = re.sub(r'<\/?b>','',s)
+    s = re.sub(r' +',' ',s)
+    s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
+    s = re.sub('^ ','',s)
+    return s
+
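+# Rough usage sketch for the two helpers above, mirroring the freshdesk()
+# pipeline (assumes a tomd import and an html.parser.HTMLParser instance `h`,
+# as content.py already has; `raw_html` is a placeholder):
+#
+#   markdown = mycleaner( tomd.convert( stripper( h.unescape(raw_html) ) ) )
+#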

 def print_table(table):
     longest_cols = [