the vector search toy

Coding with Peter committed 2023-04-17 12:08:04 -07:00
parent ec8658cd8f
commit 532a71a3da
5 changed files with 381 additions and 90 deletions

View File

@@ -1,26 +1,24 @@
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
import requests, codecs, os, re, json, sys, pypandoc
import webbrowser, bs4, trafilatura, pickle, tomd, checker
import html2markdown as h2m
from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
from collections import defaultdict
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer, util

h = HTMLParser()

DBG = 1

def d(s):
    global DBG
    if DBG: print(s)
@@ -829,6 +827,44 @@ Schedule an In-Person, Phone or Zoom Appointment"""
    print(f"Vector for the word '{example_word}': {vector}")
def makedir():
files = os.listdir('cache/crawl')
#print(files)
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
name = m.groups()[0]
parts = name.split('+')
print(parts)
def manual_index():
files = os.listdir('cache/crawl')
#print(files)
ii = codecs.open('cache/crawl/index.html','w','utf-8')
ii.write('<html><body><h1>Site index</h1>\n')
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
name = m.groups()[0]
parts = name.split('+')
ii.write('<br /><a href="mirror/'+f+'">'+f+'</a>\n')
def my_site():
files = os.listdir('cache/crawl')
output = []
files.sort()
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
name = m.groups()[0]
parts = name.split('+')
output.append(parts)
return output
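
# my_site() returns the mirrored pages as lists of URL path parts; the Flask app below
# imports it (from content import my_site).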
## TODO site scraper
## TODO find package that extracts text from web page
### TODO master list of what to index.

@@ -836,21 +872,132 @@ Schedule an In-Person, Phone or Zoom Appointment"""
## TODO PDFs and DOCXs
## TODO fix urls w/ anchors
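
# crawl(): a Scrapy spider that starts at https://www.gavilan.edu, mirrors PDFs, Office files
# and images into cache/crawl, extracts readable text with pdfminer/pypandoc/trafilatura, and
# only follows links that stay on gavilan.edu and do not match the avoid list.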
def crawl():
import scrapy, logging
from scrapy.crawler import CrawlerProcess
logger = logging.getLogger()
logger.setLevel(level=logging.CRITICAL)
logging.basicConfig(level=logging.CRITICAL)
logger.disabled = True
avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft',
'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ]
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://gavilan.curriqunet.com/catalog/iq/1826']
start_urls = ['https://www.gavilan.edu']
"""
logging.getLogger("scrapy").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.utils.log").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.extensions.telnet").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.core.engine").setLevel(logging.CRITICAL)
logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
logger.disabled = True"""
def parse(self, response):
print('visited:', repr(response.url), 'status:', response.status)
if re.search(r'\.pdf$', response.url):
m = re.search(r'\/([^\/]+\.pdf)$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
text = extract_text(save_folder + '/' + clean_fn(response.url))
codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text)
for ext in ['doc','docx','ppt','pptx']:
if re.search(r'\.'+ext+'$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
#text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt')
# convert the saved Office file to HTML with pandoc, dumping any embedded media next to it,
# then pull readable text out of that HTML
output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % (save_folder + '/' + clean_fn(response.url) + '_media') ])
txt_output = trafilatura.extract(output, include_links=True, deduplicate=True, include_images=True, include_formatting=True)
if txt_output:
codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output)
for ext in ['jpg','jpeg','gif','webp']:
if re.search(r'\.'+ext+'$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8')
this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
if this_output:
f_out.write(this_output)
f_out.close()
links = response.css('a::attr(href)').getall()
# Follow each link and parse its contents
for link in links:
go = 1
full_link = response.urljoin(link)
print('++++++ trying ', full_link)
if not re.search(r'gavilan\.edu',full_link):
go = 0
print('--- not gav edu')
else:
if re.search(r'hhh\.gavilan\.edu',full_link):
pass
elif not re.search(r'^https?:\/\/www\.gavilan\.edu',full_link):
# need to add www to gavilan.edu
m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$',full_link)
if m:
full_link = m.group(1) + 'www.gavilan.edu' + m.group(2)
for a in avoid:
if re.search(a,full_link):
go = 0
print('--- avoid ', a)
if go: yield scrapy.Request(full_link, callback=self.parse,
headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"})
else:
print("------ avoiding ", full_link)
# Instantiate a CrawlerProcess object
process = CrawlerProcess()
# Add the MySpider spider to the process
process.crawl(MySpider)
# Start the process
logging.basicConfig(level=logging.CRITICAL)
logging.getLogger('scrapy').propagate = False
logging.getLogger("trafilatura").setLevel(logging.CRITICAL)
logging.getLogger("trafilatura").propagate = False
logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
logging.getLogger("pdfminer").propagate = False
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
logging.getLogger("urllib3").propagate = False
logging.basicConfig(level=logging.CRITICAL)
process.start()
save_folder = 'cache/crawl'
clean_folder = 'cache/cleancrawl'

def clean_fn(s):
    s = re.sub(r'[\s:]+','',s)
    s = re.sub(r'\/','+',s)
    return s
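
# e.g. clean_fn('https://www.gavilan.edu/about/index.html') == 'https++www.gavilan.edu+about+index.html'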
def format_html(html):

@@ -858,25 +1005,7 @@ def format_html(html):
    return soup.prettify()
def txt_clean_index():
    files = os.listdir(save_folder)

@@ -902,33 +1031,61 @@ def txt_clean_index():
        out.write(L + '\n')
    out.close()
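
# create_embeddings(): encode every non-empty line of every mirrored page with the
# all-MiniLM-L6-v2 sentence-transformer and pickle (file, sentence, embedding) rows.
# search_embeddings(): embed a query and rank those sentences by cosine similarity.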
def search_embeddings():
    model = SentenceTransformer('all-MiniLM-L6-v2')
    save_embeds = pickle.load( open( "cache/embeddings.p", "rb" ) )
    columns = list(zip(*save_embeds))
    files = columns[0]
    sentences = columns[1]
    embeddings = columns[2]
    print(files[:20])
    print(sentences[:20])
    print(embeddings[:20])

    s = ''
    while s != 'q':
        s = input("search or 'q' to quit: ")
        if s == 'q':
            return
        query_embedding = model.encode(s)

        # Compute the cosine similarity between the query embedding and the sentence embeddings
        # (cos_sim returns a 1 x N tensor, so take its first row)
        cosine_scores = util.cos_sim(query_embedding, embeddings)

        # Sort the sentences by their cosine similarity to the query sentence
        results = sorted(zip(sentences, cosine_scores[0], files), key=lambda x: x[1], reverse=True)

        # Print the top 5 results
        for i, (sentence, score, file) in enumerate(results[:5]):
            print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')

def create_embeddings():
    model = SentenceTransformer('all-MiniLM-L6-v2')
    files = os.listdir('cache/crawl')
    output = []
    save_embeds = []  # ['file','sentence','embedding']
    files.sort()
    for f in files:
        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
        if m:
            lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines()
            lines = [L.strip() for L in lines]
            lines = [L for L in lines if L]
            embeddings = model.encode(lines)
            print("\n-----", f)

            # Print the embeddings
            for sentence, embedding in zip(lines, embeddings):
                print("Sentence:", sentence)
                #print("Embedding:", embedding)
                save_embeds.append([f,sentence,embedding])

    pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) )
if __name__ == "__main__":

@@ -943,16 +1100,23 @@ if __name__ == "__main__":
        7: ['demo vector search', demo_vector_search],
        8: ['crawl', crawl],
        9: ['clean text index', txt_clean_index],
        10: ['make web dir struct', manual_index],
        11: ['create search embeddings', create_embeddings],
        12: ['do a search', search_embeddings],
    }

    if len(sys.argv) > 1 and re.search(r'^\d+', sys.argv[1]):
        resp = int(sys.argv[1])
        print("\n\nPerforming: %s\n\n" % options[resp][0])
    else:
        print('')
        for key in options:
            print(str(key) + '.\t' + options[key][0])
        print('')
        resp = input('Choose: ')

    # Call the function in the options dict
    options[int(resp)][1]()
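
# Typical use (assuming this script is content.py): `python content.py 8` crawls the site,
# `python content.py 11` builds cache/embeddings.p, and `python content.py 12` runs the
# interactive search; with no argument the numbered menu is shown instead.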

View File

@@ -1807,3 +1807,95 @@ def freshdesk():
#### content.py
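
# pattern.web-based crawler (GavCrawl) and scratch samples, moved out of content.py.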
from pattern.web import plaintext, extension
from pattern.web import download
#from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF
class GavCrawl(Crawler):
def visit(self, link, source=None):
print('visited:', repr(link.url), 'from:', link.referrer)
print(' ', link.url.mimetype)
#txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
#codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
def fail(self, link):
print('failed:', repr(link.url))
if re.search(r'\.pdf$', link.url):
m = re.search(r'\/([^\/]+\.pdf)$', link.url)
if m:
save_file = m.group(1)
print("saving to ", save_folder + '/' + save_file)
pdf_response = requests.get(link.url)
with open(save_folder + '/' + save_file, 'wb') as f:
f.write(pdf_response.content)
text = extract_text(save_folder + '/' + save_file)
#print(text)
codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text)
else:
print("no match for pdf url: ", link.url)
for ext in ['jpg','jpeg','gif','webp']:
if re.search(r'\.'+ext+'$', link.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', link.url)
if m:
save_file = m.group(1)
print("saving to ", save_folder + '/' + save_file)
pdf_response = requests.get(link.url)
with open(save_folder + '/' + save_file, 'wb') as f:
f.write(pdf_response.content)
else:
print('no match for '+ext+' url: ', link.url)
def crawl2():
#p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
#p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
while not p.done:
try:
p.crawl(method=DEPTH, cached=False, throttle=0.76)
except Exception as e:
print("Exception: ", e)
def samples():
crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
print(url.mimetype in MIMETYPE_IMAGE)
#html = download('http://www.clips.ua.ac.be/', unicode=True)
s = URL('http://www.clips.ua.ac.be').download()
s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
# getting absolute urls
from pattern.web import URL, DOM, abs
url = URL('http://www.clips.ua.ac.be')
dom = DOM(url.download())
for link in dom('a'):
print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
# get pdfs
from pattern.web import URL, PDF
url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
pdf = PDF(url.download())
print(pdf.string)

gpt.py
View File

@@ -4,8 +4,8 @@ import openai
from canvas_secrets import openai_org, openai_api_key

openai.organization = openai_org
openai.api_key = openai_api_key

#print(openai.Model.list())
my_prompt = "Write a series of texts trying to sell a pen to a stranger."

View File

@@ -1,4 +1,3 @@
import heapq, re, csv, os, shutil, datetime, urllib
import itertools, time, markdown, csv, json, os.path, webbrowser, threading
from functools import wraps

@@ -15,6 +14,20 @@ import localcache
from server import *
from canvas_secrets import flask_secretkey
from content import my_site

import socket
this_host = socket.gethostname()
print('\n\n' + this_host, '\n\n')

has_curses = 0
if this_host != 'ROGDESKTOP':
    import curses
    has_curses = 1
else:
    print("Skipping curses stuff")

q = Queue()

@@ -25,7 +38,6 @@ PORT_NUMBER = 8080 # Maybe set this to 9000.
datafile = 'lambda.csv'

####

@@ -95,7 +107,16 @@ def flask_thread(q):

    @app.route('/mirror')
    def mirror():
        return codecs.open('cache/crawl/index.html','r','utf-8').read()

    @app.route('/mirror/<filename>')
    def mirror_file(filename):
        return markdown.markdown( codecs.open('cache/crawl/'+filename,'r','utf-8').read() ) + \
            "<pre>" + codecs.open('cache/crawl/'+filename,'r','utf-8').read() + "</pre>"

    @app.route('/clearscreens')
    def clears():
        clearscreens()

@@ -166,6 +187,7 @@ def flask_thread(q):
    @app.route('/x/writing/images/<fname>')
    def writing_img(fname):
        # TODO
        img_path = "/media/hd2/peter_home/Documents/writing_img/"
        print(img_path + fname + " - writing images folder")
        img_ext = fname.split('.')[-1]

View File

@@ -1,5 +1,5 @@
import json, codecs, re, markdown, os, pypandoc, striprtf, sqlite3, random, urllib
import subprocess, html, time
from striprtf.striprtf import rtf_to_text
from flask import render_template, Response
from flask import send_from_directory

@@ -16,8 +16,33 @@ from localcache import arrange_data_for_web, depts_with_classcounts, dept_with_s
from yattag import Doc

import socket
this_host = socket.gethostname()
print('\n\n server host: ' + this_host, '\n\n')

LECPATH = "/media/hd2/peter_home_offload/lecture/"
host = 'http://192.168.1.6:5000'
news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
writing_path = '/media/hd2/peter_home/Documents/writing/'
img_path = '/media/hd2/peter_home/Documents/writing_img/'
pics_path = '/media/hd2/peter_home/misc/'

if this_host == 'ROGDESKTOP':
    LECPATH = "d:/peter_home_offload/lecture/"
    host = 'http://192.168.1.7:5000'
    news_path = 'd:/peter_home/Documents/scripts/browser/'
    writing_path = 'd:/peter_home/Documents/writing/'
    img_path = 'd:/peter_home/Documents/writing_img/'
    pics_path = 'd:/peter_home/misc/'

import paho.mqtt.client as mqtt

@@ -55,20 +80,20 @@ def on_message(client, userdata, msg):
    print(" %s mqtt msg: %s data: %s" % (now, msg.topic, msg.payload.decode()))

if 0:
    while(mqtt_offline):
        try:
            client = mqtt.Client()
            client.on_connect = on_connect
            client.on_message = on_message
            client.connect("192.168.1.6", 1883, 60)
            mqtt_offline = 0
        except OSError as oe:
            print('no internet? try again in 5 seconds.')
            time.sleep(5)

@@ -114,18 +139,6 @@ def screenoff():

######

br = "<br />"
nl = "\n"