From 532a71a3daf6933468fb7ba4b5de85061f96c7c5 Mon Sep 17 00:00:00 2001 From: Coding with Peter Date: Mon, 17 Apr 2023 12:08:04 -0700 Subject: [PATCH] the vector search toy --- content.py | 286 ++++++++++++++++++++++++++++++++++++++----------- depricated.py | 92 ++++++++++++++++ gpt.py | 4 +- interactive.py | 26 ++++- server.py | 63 ++++++----- 5 files changed, 381 insertions(+), 90 deletions(-) diff --git a/content.py b/content.py index d281bb8..6bc1972 100644 --- a/content.py +++ b/content.py @@ -1,26 +1,24 @@ #saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() ) -import requests, codecs, os, re, json +import requests, codecs, os, re, json, sys, pypandoc +import webbrowser, bs4, trafilatura, pickle, tomd, checker +import html2markdown as h2m from pipelines import header, fetch, url, put_file from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner from bs4 import BeautifulSoup as bs from html.parser import HTMLParser from collections import defaultdict -import tomd, checker -import html2markdown as h2m -import pypandoc -import webbrowser -h = HTMLParser() +from pdfminer.high_level import extract_text +from sentence_transformers import SentenceTransformer, util +h = HTMLParser() DBG = 1 def d(s): global DBG if DBG: print(s) - - @@ -829,6 +827,44 @@ Schedule an In-Person, Phone or Zoom Appointment""" print(f"Vector for the word '{example_word}': {vector}") + +def makedir(): + files = os.listdir('cache/crawl') + #print(files) + files.sort() + for f in files: + m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f) + if m: + name = m.groups()[0] + parts = name.split('+') + print(parts) + +def manual_index(): + files = os.listdir('cache/crawl') + #print(files) + ii = codecs.open('cache/crawl/index.html','w','utf-8') + ii.write('

<h1>Site index</h1>\n')
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            ii.write('<a href="'+f+'">
'+f+'\n') + +def my_site(): + files = os.listdir('cache/crawl') + output = [] + files.sort() + for f in files: + m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f) + if m: + name = m.groups()[0] + parts = name.split('+') + output.append(parts) + return output + + ## TODO site scraper ## TODO find package that extracts text from web page ### TODO master list of what to index. @@ -836,21 +872,132 @@ Schedule an In-Person, Phone or Zoom Appointment""" ## TODO PDFs and DOCXs ## TODO fix urls w/ anchors +def crawl(): + import scrapy, logging + from scrapy.crawler import CrawlerProcess + + logger = logging.getLogger() + logger.setLevel(level=logging.CRITICAL) + logging.basicConfig(level=logging.CRITICAL) + logger.disabled = True + + + avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft', + 'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ] + + class MySpider(scrapy.Spider): + name = 'myspider' + #start_urls = ['https://gavilan.curriqunet.com/catalog/iq/1826'] + start_urls = ['https://www.gavilan.edu'] + + + """ + logging.getLogger("scrapy").setLevel(logging.CRITICAL) + logging.getLogger("scrapy.utils.log").setLevel(logging.CRITICAL) + logging.getLogger("scrapy.extensions.telnet").setLevel(logging.CRITICAL) + logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL) + logging.getLogger("scrapy.core.engine").setLevel(logging.CRITICAL) + logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL) + + logger.disabled = True""" + + def parse(self, response): + print('visited:', repr(response.url), 'status:', response.status) + + if re.search(r'\.pdf$', response.url): + m = re.search(r'\/([^\/]+\.'+ext+')$', response.url) + if m: + print("saving to ", save_folder + '/' + clean_fn(response.url)) + pdf_response = requests.get(response.url) + with open(save_folder + '/' + clean_fn(response.url), 'wb') as f: + f.write(pdf_response.content) + text = extract_text(save_folder + '/' + clean_fn(response.url)) + codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text) + + for ext in ['doc','docx','ppt','pptx']: + if re.search(r'\.'+ext+'$', response.url): + m = re.search(r'\/([^\/]+\.'+ext+')$', response.url) + if m: + print("saving to ", save_folder + '/' + clean_fn(response.url)) + pdf_response = requests.get(response.url) + with open(save_folder + '/' + clean_fn(response.url), 'wb') as f: + f.write(pdf_response.content) + #text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt') + output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % hash ]) + txt_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True) + if txt_output: + codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output) + + for ext in ['jpg','jpeg','gif','webp']: + if re.search(r'\.'+ext+'$', response.url): + m = re.search(r'\/([^\/]+\.'+ext+')$', response.url) + if m: + print("saving to ", save_folder + '/' + clean_fn(response.url)) + pdf_response = requests.get(response.url) + with open(save_folder + '/' + clean_fn(response.url), 'wb') as f: + f.write(pdf_response.content) + + f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8') + + this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True) + if this_output: + 
f_out.write(this_output) + f_out.close() + links = response.css('a::attr(href)').getall() + + # Follow each link and parse its contents + + for link in links: + go = 1 + full_link = response.urljoin(link) + print('++++++ trying ', full_link) + + if not re.search(r'gavilan\.edu',full_link): + go = 0 + print('--- not gav edu') + else: + if re.search(r'hhh\.gavilan\.edu',full_link): + pass + elif not re.search(r'^https?:\/\/www\.gavilan\.edu',full_link): + # need to add www to gavilan.edu + m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$',full_link) + if m: + full_link = m.group(1) + 'www.' + m.group(2) + for a in avoid: + if re.search(a,full_link): + go = 0 + print('--- avoid ', a) + + if go: yield scrapy.Request(full_link, callback=self.parse, + headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}) + else: + print("------ avoiding ", full_link) + # Instantiate a CrawlerProcess object + process = CrawlerProcess() + + # Add the MySpider spider to the process + process.crawl(MySpider) + + # Start the process + logging.basicConfig(level=logging.CRITICAL) + logging.getLogger('scrapy').propagate = False + logging.getLogger("trafilatura").setLevel(logging.CRITICAL) + logging.getLogger("trafilatura").propagate = False + logging.getLogger("pdfminer").setLevel(logging.CRITICAL) + logging.getLogger("pdfminer").propagate = False + logging.getLogger("urllib3").setLevel(logging.CRITICAL) + logging.getLogger("urllib3").propagate = False + logging.basicConfig(level=logging.CRITICAL) + process.start() -from pattern.web import plaintext, extension -from pattern.web import download -#from pattern import URL, MIMETYPE_IMAGE -from pattern.web import Crawler, DEPTH -import bs4 -import trafilatura save_folder = 'cache/crawl' clean_folder = 'cache/cleancrawl' def clean_fn(s): s = re.sub(r'[\s:]+','',s) - s = re.sub(r'\/','_',s) + s = re.sub(r'\/','+',s) return s def format_html(html): @@ -858,25 +1005,7 @@ def format_html(html): return soup.prettify() -class GavCrawl(Crawler): - def visit(self, link, source=None): - print('visited:', repr(link.url), 'from:', link.referrer) - #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]}) - #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt)) - - codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True)) - - def fail(self, link): - print('failed:', repr(link.url)) - -def crawl(): - p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75) - while not p.done: - try: - p.crawl(method=DEPTH, cached=False, throttle=0.76) - except Exception as e: - print("Exception: ", e) def txt_clean_index(): files = os.listdir(save_folder) @@ -902,33 +1031,61 @@ def txt_clean_index(): out.write(L + '\n') out.close() -def samples(): - crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO) - - url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif') - print(url.mimetype in MIMETYPE_IMAGE) - #html = download('http://www.clips.ua.ac.be/', unicode=True) - s = URL('http://www.clips.ua.ac.be').download() - s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']}) +def search_embeddings(): + model = SentenceTransformer('all-MiniLM-L6-v2') + save_embeds = 
pickle.load( open( "cache/embeddings.p", "rb" ) ) + columns = list(zip(*save_embeds)) + files = columns[0] + sentences = columns[1] + embeddings = columns[2] + + print(files[:20]) + print(sentences[:20]) + print(embeddings[:20]) + + s = '' + while s != 'q': + s = input("search or 'q' to quit: ") + if s == 'q': + return + query_embedding = model.encode(s) + + # Compute the cosine similarity between the query embedding and the sentence embeddings + cosine_scores = util.cos_sim(query_embedding, embeddings) + + # Sort the sentences by their cosine similarity to the query sentence + results = sorted(zip(sentences, cosine_scores, files), key=lambda x: x[1], reverse=True) + + # Print the top 5 results + for i, (sentence, score, file) in enumerate(results[:5]): + print(f'Top {i+1}: {file} - {sentence} - (Score: {score})') - # getting absolute urls - from pattern.web import URL, DOM, abs +def create_embeddings(): + model = SentenceTransformer('all-MiniLM-L6-v2') + files = os.listdir('cache/crawl') + output = [] + save_embeds = [] # ['file','sentence','embedding'] + files.sort() + for f in files: + m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f) + if m: + lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines() + lines = [L.strip() for L in lines] + lines = [L for L in lines if L] + embeddings = model.encode(lines) - url = URL('http://www.clips.ua.ac.be') - dom = DOM(url.download()) - for link in dom('a'): - print(abs(link.attributes.get('href',''), base=url.redirect or url.string)) - - # get pdfs - from pattern.web import URL, PDF - - url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf') - pdf = PDF(url.download()) - print(pdf.string) + print("\n-----", f) + #Print the embeddings + for sentence, embedding in zip(lines, embeddings): + print("Sentence:", sentence) + #print("Embedding:", embedding) + + save_embeds.append([f,sentence,embedding]) + pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) ) if __name__ == "__main__": @@ -943,16 +1100,23 @@ if __name__ == "__main__": 7: ['demo vector search', demo_vector_search], 8: ['crawl',crawl], 9: ['clean text index', txt_clean_index], + 10: ['make web dir struct', manual_index], + 11: ['create search embeddings', create_embeddings], + 12: ['do a search', search_embeddings], } - for key in options: - print(str(key) + '.\t' + options[key][0]) + if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]): + resp = int(sys.argv[1]) + print("\n\nPerforming: %s\n\n" % options[resp][0]) - print('') - resp = input('Choose: ') + else: + print ('') + for key in options: + print(str(key) + '.\t' + options[key][0]) + + print('') + resp = input('Choose: ') # Call the function in the options dict options[ int(resp)][1]() - - - + diff --git a/depricated.py b/depricated.py index d79baaf..0fb44e7 100644 --- a/depricated.py +++ b/depricated.py @@ -1807,3 +1807,95 @@ def freshdesk(): + + +#### content.py + + +from pattern.web import plaintext, extension +from pattern.web import download +#from pattern import URL, MIMETYPE_IMAGE +from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF + +class GavCrawl(Crawler): + def visit(self, link, source=None): + print('visited:', repr(link.url), 'from:', link.referrer) + print(' ', link.url.mimetype) + #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]}) + #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + 
'.txt','w','utf-8').write(tomd.convert(txt)) + + codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True)) + + + def fail(self, link): + print('failed:', repr(link.url)) + if re.search(r'\.pdf$', link.url): + m = re.search(r'\/([^\/]+\.pdf)$', link.url) + if m: + save_file = m.group(1) + print("saving to ", save_folder + '/' + save_file) + pdf_response = requests.get(link.url) + with open(save_folder + '/' + save_file, 'wb') as f: + f.write(pdf_response.content) + text = extract_text(save_folder + '/' + save_file) + #print(text) + codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text) + else: + print("no match for pdf url: ", link.url) + + for ext in ['jpg','jpeg','gif','webp']: + if re.search(r'\.'+ext+'$', link.url): + m = re.search(r'\/([^\/]+\.'+ext+')$', link.url) + if m: + save_file = m.group(1) + print("saving to ", save_folder + '/' + save_file) + pdf_response = requests.get(link.url) + with open(save_folder + '/' + save_file, 'wb') as f: + f.write(pdf_response.content) + else: + print('no match for '+ext+' url: ', link.url) + +def crawl2(): + #p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75) + #p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75) + p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75) + + + + + while not p.done: + try: + p.crawl(method=DEPTH, cached=False, throttle=0.76) + except Exception as e: + print("Exception: ", e) + + + +def samples(): + crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO) + + url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif') + print(url.mimetype in MIMETYPE_IMAGE) + + + #html = download('http://www.clips.ua.ac.be/', unicode=True) + s = URL('http://www.clips.ua.ac.be').download() + s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']}) + + + # getting absolute urls + from pattern.web import URL, DOM, abs + + url = URL('http://www.clips.ua.ac.be') + dom = DOM(url.download()) + for link in dom('a'): + print(abs(link.attributes.get('href',''), base=url.redirect or url.string)) + + # get pdfs + from pattern.web import URL, PDF + + url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf') + pdf = PDF(url.download()) + print(pdf.string) + + diff --git a/gpt.py b/gpt.py index 42dfe0d..36d5394 100644 --- a/gpt.py +++ b/gpt.py @@ -4,8 +4,8 @@ import openai from canvas_secrets import openai_org, openai_api_key -openai.organization = "org-66WLoZQEtBrO42Z9S8rfd10M" -openai.api_key = "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" +openai.organization = openai_org +openai.api_key = openai_api_key #print(openai.Model.list()) my_prompt = "Write a series of texts trying to sell a pen to a stranger." 
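The gpt.py hunk above swaps the hard-coded OpenAI credentials for values imported from a local canvas_secrets module (interactive.py pulls flask_secretkey from the same place). A minimal sketch of what that module is assumed to look like -- the variable names come from the imports in this patch, the values are placeholders, and the real file is expected to stay out of version control:

    # canvas_secrets.py -- assumed layout, placeholder values only
    openai_org = "org-XXXXXXXXXXXXXXXX"               # OpenAI organization id
    openai_api_key = "sk-XXXXXXXXXXXXXXXX"            # OpenAI API key
    flask_secretkey = "replace-with-a-random-string"  # Flask session secret used by interactive.py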
diff --git a/interactive.py b/interactive.py index 33d3413..7fb76f4 100644 --- a/interactive.py +++ b/interactive.py @@ -1,4 +1,3 @@ -import curses import heapq, re, csv, os, shutil, datetime, urllib import itertools, time, markdown, csv, json, os.path, webbrowser, threading from functools import wraps @@ -15,6 +14,20 @@ import localcache from server import * from canvas_secrets import flask_secretkey +from content import my_site + +import socket +this_host = socket.gethostname() + +print('\n\n' + this_host, '\n\n') + +has_curses = 0 +if this_host != 'ROGDESKTOP': + import curses + has_curses = 1 +else: + print("Skipping curses stuff") + q = Queue() @@ -25,7 +38,6 @@ PORT_NUMBER = 8080 # Maybe set this to 9000. datafile = 'lambda.csv' -#writing_path = 'c:/users/peter/Nextcloud/Documents/writing/' #### @@ -95,7 +107,16 @@ def flask_thread(q): + @app.route('/mirror') + def mirror(): + return codecs.open('cache/crawl/index.html','r','utf-8').read() + + @app.route('/mirror/') + def mirror_file(filename): + return markdown.markdown( codecs.open('cache/crawl/'+filename,'r','utf-8').read() ) + \ + "
" + codecs.open('cache/crawl/'+filename,'r','utf-8').read() + "
" + @app.route('/clearscreens') def clears(): clearscreens() @@ -166,6 +187,7 @@ def flask_thread(q): @app.route('/x/writing/images/') def writing_img(fname): + # TODO img_path = "/media/hd2/peter_home/Documents/writing_img/" print(img_path + fname + " - writing images folder") img_ext = fname.split('.')[-1] diff --git a/server.py b/server.py index aaac0f8..502e352 100644 --- a/server.py +++ b/server.py @@ -1,5 +1,5 @@ import json, codecs, re, markdown, os, pypandoc, striprtf, sqlite3, random, urllib -import subprocess, html +import subprocess, html, time from striprtf.striprtf import rtf_to_text from flask import render_template, Response from flask import send_from_directory @@ -16,8 +16,33 @@ from localcache import arrange_data_for_web, depts_with_classcounts, dept_with_s from yattag import Doc +import socket +this_host = socket.gethostname() +print('\n\n server host: ' + this_host, '\n\n') + LECPATH = "/media/hd2/peter_home_offload/lecture/" - host = 'http://192.168.1.6:5000' +host = 'http://192.168.1.6:5000' +news_path = '/media/hd2/peter_home/Documents/scripts/browser/' +writing_path = '/media/hd2/peter_home/Documents/writing/' +img_path = '/media/hd2/peter_home/Documents/writing_img/' +pics_path = '/media/hd2/peter_home/misc/' + + +if this_host == 'ROGDESKTOP': + LECPATH = "d:/peter_home_offload/lecture/" + host = 'http://192.168.1.7:5000' + news_path = 'd:/peter_home/Documents/scripts/browser/' + writing_path = 'd:/peter_home/Documents/writing/' + img_path = 'd:/peter_home/Documents/writing_img/' + pics_path = 'd:/peter_home/misc/' + + + + + + + + import paho.mqtt.client as mqtt @@ -55,20 +80,20 @@ def on_message(client, userdata, msg): print(" %s mqtt msg: %s data: %s" % (now, msg.topic, msg.payload.decode())) +if 0: + while(mqtt_offline): + try: + client = mqtt.Client() + client.on_connect = on_connect + client.on_message = on_message -while(mqtt_offline): - try: - client = mqtt.Client() - client.on_connect = on_connect - client.on_message = on_message + client.connect("192.168.1.6", 1883, 60) - client.connect("192.168.1.6", 1883, 60) + mqtt_offline = 0 - mqtt_offline = 0 - - except OSError as oe: - print('no internet? try again in 5 seconds.') - time.sleep(5) + except OSError as oe: + print('no internet? try again in 5 seconds.') + time.sleep(5) @@ -114,18 +139,6 @@ def screenoff(): ###### -news_path = '/media/hd2/peter_home/Documents/scripts/browser/' - -if platform.system() == 'Windows': - writing_path = 'c:/users/peter/Nextcloud/Documents/writing/' -else: - writing_path = '/media/hd2/peter_home/Documents/writing/' - img_path = '/media/hd2/peter_home/Documents/writing_img/' - -if platform.system() == 'Windows': - pics_path = 'c:/users/peter/Nextcloud/misc/' -else: - pics_path = '/media/hd2/peter_home/misc/' br = "
" nl = "\n"