Moving functions to deprecated; cleaning up

Coding with Peter 2023-04-11 08:56:24 -07:00
parent 035db48464
commit 0c358da2ab
3 changed files with 239 additions and 115 deletions

View File

@@ -3,7 +3,7 @@
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
import requests, codecs, os, re, json
from pipelines import header, fetch, url, put_file
-from util import clean_title, to_file_friendly, minimal_string
+from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
import tomd, checker
@@ -19,63 +19,9 @@ def d(s):
    global DBG
    if DBG: print(s)
def stripper(s):
    REMOVE_ATTRIBUTES = [
        'lang','language','onmouseover','onmouseout','script','style','font',
        'dir','face','size','color','style','class','width','height','hspace',
        'border','valign','align','background','bgcolor','text','link','vlink',
        'alink','cellpadding','cellspacing']
    #doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is <i>paragraph</i> <a onmouseout="">one</a>.<p id="secondpara" align="blah">This is <i>paragraph</i> <b>two</b>.</html>'''
    soup = bs(s, features='lxml')
    for tag in soup.recursiveChildGenerator():
        try:
            tag.attrs = {key:value for key,value in tag.attrs.iteritems()
                         if key not in REMOVE_ATTRIBUTES}
        except AttributeError:
            # 'NavigableString' object has no attribute 'attrs'
            pass
    return soup.prettify()

def mycleaner(s):
    s = re.sub(r'<br\s?\/>','\n',s)
    s = re.sub(r'<\/?b>','',s)
    s = re.sub(r' +',' ',s)
    s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
    s = re.sub('^ ','',s)
    return s
def freshdesk():
    path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
    soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
    outpt = codecs.open('cache/faqs.txt','w')
    out = ""
    for a in soup.find_all('solution-article'):
        print("TITLE\n"+a.find('title').get_text())
        out += a.find('title').get_text()
        """for d in a.find_all('description'):
            #print(d)
            if d:
                d = h.unescape(d.get_text())
                e = stripper(d)
                m = tomd.convert( e )
                m = mycleaner(m)
                print("\nDESCRIPTION\n"+m)"""
        #print("\nWHAT IS THIS?\n" +
        hh = a.find('desc-un-html').get_text()
        d = h.unescape(hh)
        e = stripper(d)
        m = tomd.convert( e )
        m = mycleaner(m)
        print("\nDESCRIPTION\n"+m)
        out += "\n\n" + m + "\n\n"
        print("-----------\n\n")
    outpt.write(out)
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
@@ -533,64 +479,6 @@ def grab_course_pages(course_num=-1):
    pageout.close()
    pageoutm.close()
# Upload pages. Local copy has a particular format.
# Appears to not be used
def put_course_pages():
    course_num = '6862'
    filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search('^###\s(.*)###\s(.*)$',L)
        if ma:
            my_titles.append(ma.group(1))
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1

# Also not used
def put_revised_pages():
    course_num = '6862'
    course_folder = '../course_temps/course_6862'
    filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search('^<h1>(.*)</h1>.*$',L)
        if ma:
            my_titles.append(ma.group(1))
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1
# Download, clean html, and reupload page
def update_page():
    global results, results_dict, url, header
@@ -837,6 +725,115 @@ def multiple_downloads():
    accessible_check(id)
###
###
### Text / Knowledge Base
###
### How about downloading all possible info / webpages / sources
### related to Gavilan and creating a master search index?
###
### Goals:
### - Scripted approach to allow re-indexing / updating
### - Break everything down into paragraphs
###
### - Script to extract keywords, topics, entities, summaries, questions answered
### from each paragraph or chunk.
### - Use spacy, gensim, nltk, or gpt-3, or a combination of all of them
###
### - Create vector / embeddings for each paragraph
###
### - Enable a vector search engine and connect to front page of gavilan.cc
### - Use that to feed a handful of source paragraphs (& prompt) into gpt and
###   receive text answers to questions (see the retrieval sketch after
###   demo_vector_search below)
def demo_vector_search():
    from gensim.models import Word2Vec
    from gensim.utils import simple_preprocess
    import nltk.data
    import spacy
    # (might have to upgrade pip first...)
    # pip install --upgrade click
    #
    # python -m spacy download en_core_web_sm
    # python -m spacy download en_core_web_md
    # python -m spacy download en_core_web_lg

    def is_complete_sentence(text):
        #text = text.text
        doc = nlp(text)
        sentences = list(doc.sents)
        if len(sentences) == 1 and text.strip() == sentences[0].text.strip():
            return True
        return False

    sentences = [
        "This is an example sentence.",
        "Here is another sentence for training."
    ]
    paragraph = """Financial Aid services are available in person! We are happy to assist you with your financial aid needs. If you are interested in visiting the office in person, please review the guidelines for visiting campus and schedule your appointment:
Guidelines for In-Person Financial Aid Services
Due to FERPA regulations, no student information will be given to anyone other than the student without authorization from the student.
We continue to offer virtual services. Financial Aid staff may be reached by email, phone, text, and zoom! Please refer to the contact information and schedules below.
Gavilan-WelcomeCenter_Peer_Mentors.jpg
Do you need assistance filing the FAFSA or California Dream Act Application? Friendly and knowledgeable Peer Mentors are available to assist you virtually and in person! Details below for an online Zoom visit, phone call, or in-person visit with Peer Mentors.
Monday - Friday 8am - 5pm, Student Center
Join Zoom to Connect with a Peer Mentor
Or call (669) 900-6833 and use meeting ID 408 848 4800
MicrosoftTeams-image.png
Do you need assistance with an existing financial aid application, financial aid document submission, or review of your financial aid package? Schedule an in-person, phone, or zoom appointment with our Financial Aid counter.
Mon - Thurs: 9am - 1:00pm, 2:00pm - 5:00pm
Fri: 10am - 2pm
Office: (408) 848-4727 Email: finaid@gavilan.edu
Schedule an In-Person, Phone or Zoom Appointment"""

    # sentence splitting, attempt 1: nltk punkt
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences1 = tokenizer.tokenize(paragraph)
    for i,s in enumerate(sentences1):
        print(i, "\t", s)
    print("\n\n")

    # sentence splitting, attempt 2: spacy
    #nlp = spacy.load('en_core_web_sm')
    nlp = spacy.load('en_core_web_md')
    doc = nlp(paragraph)
    sentences2 = list(doc.sents)
    for i,s in enumerate(sentences2):
        t = re.sub(r'\n+',' ',s.text)
        is_sentence = 'yes' if is_complete_sentence(t) else 'no '
        print(i, " ", is_sentence, " ", t)
    print("\n\n")
    #for text in sentences2:
    #    print(text, "is a complete sentence?" , is_complete_sentence(text))

    # NOTE: the early return below means the Word2Vec demo underneath never runs
    return

    tokenized_sentences = [simple_preprocess(s) for s in sentences]
    model = Word2Vec(tokenized_sentences, min_count=1, vector_size=100)
    example_word = "example"
    vector = model.wv[example_word]
    print(f"Vector for the word '{example_word}': {vector}")
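
### Rough sketch toward the "vector search engine" goal above: embed each
### paragraph with the same spacy en_core_web_md vectors used in
### demo_vector_search, then rank paragraphs against a query by cosine
### similarity. Not wired into anything yet; the function name, the query,
### and the sample paragraphs are placeholders.
def demo_paragraph_retrieval():
    import numpy as np
    import spacy
    nlp = spacy.load('en_core_web_md')
    paragraphs = [
        "Financial Aid services are available in person at the Student Center.",
        "Peer Mentors can help you file the FAFSA or California Dream Act Application.",
        "Schedule an in-person, phone, or zoom appointment with the Financial Aid counter.",
    ]
    # one vector per paragraph (spacy averages token vectors over the Doc)
    vectors = [nlp(p).vector for p in paragraphs]

    def cosine(a, b):
        denom = (np.linalg.norm(a) * np.linalg.norm(b)) or 1.0
        return float(np.dot(a, b) / denom)

    query = "How do I get help paying for college?"
    qv = nlp(query).vector
    # best-matching paragraphs first; these would be the chunks handed to gpt
    ranked = sorted(zip(paragraphs, vectors), key=lambda pv: cosine(qv, pv[1]), reverse=True)
    for p, v in ranked:
        print(round(cosine(qv, v), 3), "\t", p)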
## TODO site scraper
## TODO find a package that extracts text from web pages
### TODO master list of what to index.
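
### Rough sketch for the scraper TODOs above: fetch one page and keep only the
### visible text, one candidate chunk per block-level element. requests, bs,
### and re are already imported in this file; the URL default and the tag list
### are placeholder choices, and a dedicated extraction package may still be
### the better answer.
def demo_page_text(u='https://www.gavilan.edu/'):
    r = requests.get(u)
    soup = bs(r.text, features='lxml')
    # drop script/style so their contents don't leak into the extracted text
    for t in soup(['script', 'style', 'noscript']):
        t.decompose()
    chunks = []
    for tag in soup.find_all(['p', 'li', 'h1', 'h2', 'h3']):
        text = re.sub(r'\s+', ' ', tag.get_text()).strip()
        if text:
            chunks.append(text)
    return chunks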
if __name__ == "__main__":
    print ('')
@@ -844,8 +841,9 @@ if __name__ == "__main__":
        2: ['download multiple classes', multiple_downloads ],
        3: ['convert stuff', pan_testing ],
        4: ['convert md to html', md_to_course ],
-       5: ['import freshdesk content', freshdesk ],
+       # 5: ['import freshdesk content', freshdesk ],
        6: ['download all a courses pages', grab_course_pages],
+       7: ['demo vector search', demo_vector_search],
    }
    for key in options:

View File

@@ -1617,8 +1617,11 @@ for L in str.split("\n"):
## sched.py
+import codecs
import requests, re, csv, json, funcy, sys
+from content import upload_page
def dates(s):
    #print(s)
@@ -1708,8 +1711,99 @@ if __name__ == "__main__":
    options[ int(resp)][1]()
def put_revised_pages():
    course_num = '6862'
    course_folder = '../course_temps/course_6862'
    filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search('^<h1>(.*)</h1>.*$',L)
        if ma:
            my_titles.append(ma.group(1))
            # NOTE: this pattern only has one group, so group(2) would raise an
            # error if a heading ever matches; kept as-is since the function is unused
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1

# Upload pages. Local copy has a particular format.
# Appears to not be used
def put_course_pages():
    course_num = '6862'
    filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
    my_titles = []
    my_urls = []
    my_bodys = []
    started = 0
    current_body = ""
    for L in filein.readlines():
        ma = re.search(r'^###\s(.*)###\s(.*)$',L)
        if ma:
            my_titles.append(ma.group(1))
            my_urls.append(ma.group(2))
            if started:
                my_bodys.append(current_body)
                current_body = ""
            started = 1
        else:
            current_body += "\n" + L
    my_bodys.append(current_body)
    i = 0
    for U in my_urls:
        # and now upload it....lol
        upload_page(course_num,U,my_bodys[i])
        i += 1

def freshdesk():
    # these names were available at module level in content.py before the move;
    # imported locally here so the function still runs from this file
    from bs4 import BeautifulSoup as bs
    from util import stripper, mycleaner
    import tomd
    import html as h   # assumption: h was an HTML-unescaping helper; html.unescape fills that role
    path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
    soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
    outpt = codecs.open('cache/faqs.txt','w')
    out = ""
    for a in soup.find_all('solution-article'):
        print("TITLE\n"+a.find('title').get_text())
        out += a.find('title').get_text()
        """for d in a.find_all('description'):
            #print(d)
            if d:
                d = h.unescape(d.get_text())
                e = stripper(d)
                m = tomd.convert( e )
                m = mycleaner(m)
                print("\nDESCRIPTION\n"+m)"""
        #print("\nWHAT IS THIS?\n" +
        hh = a.find('desc-un-html').get_text()
        d = h.unescape(hh)
        e = stripper(d)
        m = tomd.convert( e )
        m = mycleaner(m)
        print("\nDESCRIPTION\n"+m)
        out += "\n\n" + m + "\n\n"
        print("-----------\n\n")
    outpt.write(out)

util.py
View File

@@ -4,6 +4,38 @@
import re, csv
from collections import defaultdict
+from bs4 import BeautifulSoup as bs
def stripper(s):
    REMOVE_ATTRIBUTES = [
        'lang','language','onmouseover','onmouseout','script','style','font',
        'dir','face','size','color','style','class','width','height','hspace',
        'border','valign','align','background','bgcolor','text','link','vlink',
        'alink','cellpadding','cellspacing']
    #doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is <i>paragraph</i> <a onmouseout="">one</a>.<p id="secondpara" align="blah">This is <i>paragraph</i> <b>two</b>.</html>'''
    soup = bs(s, features='lxml')
    for tag in soup.recursiveChildGenerator():
        try:
            # dict.items(), not the Python 2 iteritems(), so the attribute filter actually runs
            tag.attrs = {key:value for key,value in tag.attrs.items()
                         if key not in REMOVE_ATTRIBUTES}
        except AttributeError:
            # 'NavigableString' object has no attribute 'attrs'
            pass
    return soup.prettify()

def mycleaner(s):
    s = re.sub(r'<br\s?\/>','\n',s)
    s = re.sub(r'<\/?b>','',s)
    s = re.sub(r' +',' ',s)
    s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
    s = re.sub('^ ','',s)
    return s
def print_table(table):
    longest_cols = [