commit 6807ddd96c
phowell 2023-04-17 16:30:17 -07:00
7 changed files with 2400 additions and 1704 deletions

apphelp.py — 3596 changed lines

File diff suppressed because it is too large

View File

@@ -1,25 +1,24 @@
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
import requests, codecs, os, re, json, sys, pypandoc
import webbrowser, bs4, trafilatura, pickle, tomd, checker
import html2markdown as h2m
from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
from collections import defaultdict
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer, util
h = HTMLParser()
DBG = 1
def d(s):
global DBG
if DBG: print(s)
@@ -828,63 +827,265 @@ Schedule an In-Person, Phone or Zoom Appointment"""
print(f"Vector for the word '{example_word}': {vector}")
def makedir():
files = os.listdir('cache/crawl')
#print(files)
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
name = m.groups()[0]
parts = name.split('+')
print(parts)
def manual_index():
files = os.listdir('cache/crawl')
#print(files)
ii = codecs.open('cache/crawl/index.html','w','utf-8')
ii.write('<html><body><h1>Site index</h1>\n')
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
name = m.groups()[0]
parts = name.split('+')
ii.write('<br /><a href="mirror/'+f+'">'+f+'</a>\n')
def my_site():
files = os.listdir('cache/crawl')
output = []
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
name = m.groups()[0]
parts = name.split('+')
output.append(parts)
return output
## TODO site scraper
## TODO find package that extracts text from web page
### TODO master list of what to index.
from pattern.web import URL, plaintext, extension
from pattern.web import download
from pattern.web import URL, MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH
from util import clean_title
save_folder = 'cache/crawl'
class GavCrawl(Crawler):
def visit(self, link, source=None):
print('visited:', repr(link.url), 'from:', link.referrer)
txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
codecs.open(save_folder + '/' + clean_title(link.url) + '.txt','w','utf-8').write(txt)
def fail(self, link):
print('failed:', repr(link.url))
## TODO PDFs and DOCXs
## TODO fix urls w/ anchors
def crawl():
p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
while not p.done:
p.crawl(method=DEPTH, cached=False, throttle=3)
import scrapy, logging
from scrapy.crawler import CrawlerProcess
logger = logging.getLogger()
logger.setLevel(level=logging.CRITICAL)
logging.basicConfig(level=logging.CRITICAL)
logger.disabled = True
avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft',
'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ]
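# Regex fragments for URLs the crawler should skip (proxies, login portals, the LMS, mailto links, archive tag/category pages, legacy schedule pages).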
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://gavilan.curriqunet.com/catalog/iq/1826']
start_urls = ['https://www.gavilan.edu']
"""
logging.getLogger("scrapy").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.utils.log").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.extensions.telnet").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.core.engine").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
logger.disabled = True"""
def parse(self, response):
print('visited:', repr(response.url), 'status:', response.status)
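# PDFs: download the file to save_folder, then extract its text with pdfminer so it can be indexed alongside HTML pages.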
if re.search(r'\.pdf$', response.url):
m = re.search(r'\/([^\/]+\.pdf)$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
text = extract_text(save_folder + '/' + clean_fn(response.url))
codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text)
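# Office documents (doc/docx/ppt/pptx): save the file, convert it to HTML with pypandoc, and also attempt a trafilatura text extraction of the response.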
for ext in ['doc','docx','ppt','pptx']:
if re.search(r'\.'+ext+'$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
#text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt')
output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % save_folder ])  # media output dir: save_folder is an assumption here
txt_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
if txt_output:
codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output)
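# Images (jpg/jpeg/gif/webp): mirror the binary to disk; a .txt companion is written only if trafilatura finds any text.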
for ext in ['jpg','jpeg','gif','webp']:
if re.search(r'\.'+ext+'$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8')
this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
if this_output:
f_out.write(this_output)
f_out.close()
links = response.css('a::attr(href)').getall()
# Follow each link and parse its contents
for link in links:
go = 1
full_link = response.urljoin(link)
print('++++++ trying ', full_link)
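# Keep the crawl on gavilan.edu: drop off-site links, normalize bare gavilan.edu to www.gavilan.edu, and skip anything matching the avoid list.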
if not re.search(r'gavilan\.edu',full_link):
go = 0
print('--- not gav edu')
else:
if re.search(r'hhh\.gavilan\.edu',full_link):
pass
elif not re.search(r'^https?:\/\/www\.gavilan\.edu',full_link):
# need to add www to gavilan.edu
m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$',full_link)
if m:
full_link = m.group(1) + 'www.gavilan.edu' + m.group(2)
for a in avoid:
if re.search(a,full_link):
go = 0
print('--- avoid ', a)
if go: yield scrapy.Request(full_link, callback=self.parse,
headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"})
else:
print("------ avoiding ", full_link)
# Instantiate a CrawlerProcess object
process = CrawlerProcess()
# Add the MySpider spider to the process
process.crawl(MySpider)
# Start the process
logging.basicConfig(level=logging.CRITICAL)
logging.getLogger('scrapy').propagate = False
logging.getLogger("trafilatura").setLevel(logging.CRITICAL)
logging.getLogger("trafilatura").propagate = False
logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
logging.getLogger("pdfminer").propagate = False
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
logging.getLogger("urllib3").propagate = False
logging.basicConfig(level=logging.CRITICAL)
process.start()
def samples():
crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
save_folder = 'cache/crawl'
clean_folder = 'cache/cleancrawl'
url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
print(url.mimetype in MIMETYPE_IMAGE)
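# clean_fn maps a URL to a filesystem-safe name: strip whitespace and colons, replace '/' with '+' (the naming the re.match patterns above rely on).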
def clean_fn(s):
s = re.sub(r'[\s:]+','',s)
s = re.sub(r'\/','+',s)
return s
def format_html(html):
soup = bs4.BeautifulSoup(html, 'html.parser')
return soup.prettify()
#html = download('http://www.clips.ua.ac.be/', unicode=True)
s = URL('http://www.clips.ua.ac.be').download()
s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
def txt_clean_index():
files = os.listdir(save_folder)
line_freq = defaultdict(int)
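# Two-pass boilerplate filter: count how often each stripped line appears across the mirror, then drop lines repeated on many pages (nav bars, footers) when writing the cleaned copies.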
# first pass
for f in files:
lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
for L in lines:
L = L.strip()
line_freq[L] += 1
# second pass
for f in files:
print("\n\n",f)
lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
out = codecs.open(clean_folder + '/' + f,'w','utf-8')
for L in lines:
L = L.strip()
if L in line_freq and line_freq[L] > 3:
continue
print(L)
out.write(L + '\n')
out.close()
# getting absolute urls
from pattern.web import URL, DOM, abs
url = URL('http://www.clips.ua.ac.be')
dom = DOM(url.download())
for link in dom('a'):
print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
def search_embeddings():
model = SentenceTransformer('all-MiniLM-L6-v2')
save_embeds = pickle.load( open( "cache/embeddings.p", "rb" ) )
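# Each pickled row is [filename, sentence, embedding]; unzip into parallel columns for scoring.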
columns = list(zip(*save_embeds))
files = columns[0]
sentences = columns[1]
embeddings = columns[2]
# get pdfs
from pattern.web import URL, PDF
print(files[:20])
print(sentences[:20])
print(embeddings[:20])
url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
pdf = PDF(url.download())
print(pdf.string)
s = ''
while s != 'q':
s = input("search or 'q' to quit: ")
if s == 'q':
return
query_embedding = model.encode(s)
# Compute the cosine similarity between the query embedding and the sentence embeddings
cosine_scores = util.cos_sim(query_embedding, embeddings)[0]  # cos_sim returns a 1xN matrix; take its single row so scores pair one-to-one with sentences
# Sort the sentences by their cosine similarity to the query sentence
results = sorted(zip(sentences, cosine_scores, files), key=lambda x: x[1], reverse=True)
# Print the top 5 results
for i, (sentence, score, file) in enumerate(results[:5]):
print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')
def create_embeddings():
model = SentenceTransformer('all-MiniLM-L6-v2')
files = os.listdir('cache/crawl')
output = []
save_embeds = [] # ['file','sentence','embedding']
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines()
lines = [L.strip() for L in lines]
lines = [L for L in lines if L]
embeddings = model.encode(lines)
print("\n-----", f)
#Print the embeddings
for sentence, embedding in zip(lines, embeddings):
print("Sentence:", sentence)
#print("Embedding:", embedding)
save_embeds.append([f,sentence,embedding])
pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) )
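# Pipeline note: create_embeddings() (menu option 11) writes cache/embeddings.p from the crawled .txt mirror; search_embeddings() (option 12) loads it and ranks lines by cosine similarity against the query.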
if __name__ == "__main__":
@@ -897,16 +1098,25 @@ if __name__ == "__main__":
# 5: ['import freshdesk content', freshdesk ],
6: ['download all a courses pages', grab_course_pages],
7: ['demo vector search', demo_vector_search],
8: ['crawl',crawl],
9: ['clean text index', txt_clean_index],
10: ['make web dir struct', manual_index],
11: ['create search embeddings', create_embeddings],
12: ['do a search', search_embeddings],
}
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
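# Usage sketch (assumed invocation): something like "python apphelp.py 12" runs an option directly; with no numeric argument the menu is printed and a choice is read interactively.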

View File

@@ -1086,6 +1086,7 @@ def add_evals(section=0):
s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
s = list(funcy.flatten(s))
s.sort()
print(s)
xyz = input('hit return to continue')
#c = getCoursesInTerm(168,0,1)
@@ -1306,7 +1307,6 @@ def set_ext_tools():
if __name__ == "__main__":
options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
30: ['List latestart classes', list_latestarts ],
2: ['Add announcements to homepage', change_course_ann_homepage],
3: ['Cross-list classes', xlist ],
4: ['List students who passed quiz X', get_quiz_passers],
@@ -1335,6 +1335,7 @@ if __name__ == "__main__":
27: ['Fine tune term dates and winter session', course_dates_terms],
28: ['Cross list a semester from file', semester_cross_lister],
29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
#30: ['List latestart classes', list_latestarts ],
# TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
#
}

View File

@@ -1807,3 +1807,95 @@ def freshdesk():
#### content.py
from pattern.web import plaintext, extension
from pattern.web import download
#from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF
class GavCrawl(Crawler):
def visit(self, link, source=None):
print('visited:', repr(link.url), 'from:', link.referrer)
print(' ', link.url.mimetype)
#txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
#codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
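# Mirror trafilatura's extraction (links, images and basic formatting kept) to a .txt file named after the cleaned URL.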
codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
def fail(self, link):
print('failed:', repr(link.url))
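# fail() doubles as the binary-asset handler: responses the crawler can't parse (PDFs, images) are fetched directly with requests and saved; PDF text is extracted with pdfminer.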
if re.search(r'\.pdf$', link.url):
m = re.search(r'\/([^\/]+\.pdf)$', link.url)
if m:
save_file = m.group(1)
print("saving to ", save_folder + '/' + save_file)
pdf_response = requests.get(link.url)
with open(save_folder + '/' + save_file, 'wb') as f:
f.write(pdf_response.content)
text = extract_text(save_folder + '/' + save_file)
#print(text)
codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text)
else:
print("no match for pdf url: ", link.url)
for ext in ['jpg','jpeg','gif','webp']:
if re.search(r'\.'+ext+'$', link.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', link.url)
if m:
save_file = m.group(1)
print("saving to ", save_folder + '/' + save_file)
pdf_response = requests.get(link.url)
with open(save_folder + '/' + save_file, 'wb') as f:
f.write(pdf_response.content)
else:
print('no match for '+ext+' url: ', link.url)
def crawl2():
#p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
#p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
while not p.done:
try:
p.crawl(method=DEPTH, cached=False, throttle=0.76)
except Exception as e:
print("Exception: ", e)
def samples():
crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
print(url.mimetype in MIMETYPE_IMAGE)
#html = download('http://www.clips.ua.ac.be/', unicode=True)
s = URL('http://www.clips.ua.ac.be').download()
s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
# getting absolute urls
from pattern.web import URL, DOM, abs
url = URL('http://www.clips.ua.ac.be')
dom = DOM(url.download())
for link in dom('a'):
print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
# get pdfs
from pattern.web import URL, PDF
url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
pdf = PDF(url.download())
print(pdf.string)

gpt.py — 4 changed lines
View File

@@ -4,8 +4,8 @@ import openai
from canvas_secrets import openai_org, openai_api_key
openai.organization = "org-66WLoZQEtBrO42Z9S8rfd10M"
openai.api_key = "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"
openai.organization = openai_org
openai.api_key = openai_api_key
#print(openai.Model.list())
my_prompt = "Write a series of texts trying to sell a pen to a stranger."

View File

@@ -1,4 +1,3 @@
import heapq, re, csv, os, shutil, datetime, urllib
import itertools, time, markdown, csv, json, os.path, webbrowser, threading
from functools import wraps
@@ -15,6 +14,20 @@ import localcache
from server import *
from canvas_secrets import flask_secretkey
from content import my_site
import socket
this_host = socket.gethostname()
print('\n\n' + this_host, '\n\n')
has_curses = 0
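# Only import curses on hosts where it is expected to work; the ROGDESKTOP machine skips it.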
if this_host != 'ROGDESKTOP':
import curses
has_curses = 1
else:
print("Skipping curses stuff")
q = Queue()
@@ -25,7 +38,6 @@ PORT_NUMBER = 8080 # Maybe set this to 9000.
datafile = 'lambda.csv'
#writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'
####
@@ -95,7 +107,16 @@ def flask_thread(q):
@app.route('/mirror')
def mirror():
return codecs.open('cache/crawl/index.html','r','utf-8').read()
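# /mirror/<filename>: render the crawled text file as Markdown, then append the raw text in a <pre> block.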
@app.route('/mirror/<filename>')
def mirror_file(filename):
return markdown.markdown( codecs.open('cache/crawl/'+filename,'r','utf-8').read() ) + \
"<pre>" + codecs.open('cache/crawl/'+filename,'r','utf-8').read() + "</pre>"
@app.route('/clearscreens')
def clears():
clearscreens()
@@ -166,6 +187,7 @@ def flask_thread(q):
@app.route('/x/writing/images/<fname>')
def writing_img(fname):
# TODO
img_path = "/media/hd2/peter_home/Documents/writing_img/"
print(img_path + fname + " - writing images folder")
img_ext = fname.split('.')[-1]

View File

@@ -1,5 +1,5 @@
import json, codecs, re, markdown, os, pypandoc, striprtf, sqlite3, random, urllib
import subprocess, html, time
from striprtf.striprtf import rtf_to_text
from flask import render_template, Response
from flask import send_from_directory
@@ -16,8 +16,33 @@ from localcache import arrange_data_for_web, depts_with_classcounts, dept_with_s
from yattag import Doc
import socket
this_host = socket.gethostname()
print('\n\n server host: ' + this_host, '\n\n')
LECPATH = "/media/hd2/peter_home_offload/lecture/"
host = 'http://192.168.1.6:5000'
news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
writing_path = '/media/hd2/peter_home/Documents/writing/'
img_path = '/media/hd2/peter_home/Documents/writing_img/'
pics_path = '/media/hd2/peter_home/misc/'
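# The defaults above assume the Linux host; ROGDESKTOP overrides them with local d:/ paths.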
if this_host == 'ROGDESKTOP':
LECPATH = "d:/peter_home_offload/lecture/"
host = 'http://192.168.1.7:5000'
news_path = 'd:/peter_home/Documents/scripts/browser/'
writing_path = 'd:/peter_home/Documents/writing/'
img_path = 'd:/peter_home/Documents/writing_img/'
pics_path = 'd:/peter_home/misc/'
import paho.mqtt.client as mqtt
@@ -55,20 +80,20 @@ def on_message(client, userdata, msg):
print(" %s mqtt msg: %s data: %s" % (now, msg.topic, msg.payload.decode()))
if 0:
while(mqtt_offline):
try:
client = mqtt.Client()
client.on_connect = on_connect
client.on_message = on_message
client.connect("192.168.1.6", 1883, 60)
mqtt_offline = 0
except OSError as oe:
print('no internet? try again in 5 seconds.')
time.sleep(5)
@@ -114,18 +139,6 @@ def screenoff():
######
br = "<br />"
nl = "\n"