diff --git a/content.py b/content.py
index d281bb8..6bc1972 100644
--- a/content.py
+++ b/content.py
@@ -1,26 +1,24 @@
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
-import requests, codecs, os, re, json
+import requests, codecs, os, re, json, sys, pypandoc
+import webbrowser, bs4, trafilatura, pickle, tomd, checker
+import html2markdown as h2m
from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
from collections import defaultdict
-import tomd, checker
-import html2markdown as h2m
-import pypandoc
-import webbrowser
-h = HTMLParser()
+from pdfminer.high_level import extract_text
+from sentence_transformers import SentenceTransformer, util
+h = HTMLParser()
DBG = 1
def d(s):
global DBG
if DBG: print(s)
-
-
@@ -829,6 +827,44 @@ Schedule an In-Person, Phone or Zoom Appointment"""
print(f"Vector for the word '{example_word}': {vector}")
+
+def makedir():
+ files = os.listdir('cache/crawl')
+ #print(files)
+ files.sort()
+ for f in files:
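+        # crawl-cache filenames are flattened URLs: ':' is stripped and '/' becomes '+' (see clean_fn below)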
+ m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+ if m:
+ name = m.groups()[0]
+ parts = name.split('+')
+ print(parts)
+
+def manual_index():
+ files = os.listdir('cache/crawl')
+ #print(files)
+ ii = codecs.open('cache/crawl/index.html','w','utf-8')
+    ii.write('<h1>Site index</h1>\n')
+ files.sort()
+ for f in files:
+ m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+ if m:
+ name = m.groups()[0]
+ parts = name.split('+')
+            ii.write('<br><a href="'+f+'">'+f+'</a>\n')
+
+def my_site():
+ files = os.listdir('cache/crawl')
+ output = []
+ files.sort()
+ for f in files:
+ m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+ if m:
+ name = m.groups()[0]
+ parts = name.split('+')
+ output.append(parts)
+ return output
+
+
## TODO site scraper
## TODO find package that extracts text from web page
### TODO master list of what to index.
@@ -836,21 +872,132 @@ Schedule an In-Person, Phone or Zoom Appointment"""
## TODO PDFs and DOCXs
## TODO fix urls w/ anchors
+def crawl():
+ import scrapy, logging
+ from scrapy.crawler import CrawlerProcess
+
+ logger = logging.getLogger()
+ logger.setLevel(level=logging.CRITICAL)
+ logging.basicConfig(level=logging.CRITICAL)
+ logger.disabled = True
+
+
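+    # URL patterns to skip while crawling (proxy, portals, blog archives, mailto links, old schedules)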
+ avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft',
+ 'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ]
+
+ class MySpider(scrapy.Spider):
+ name = 'myspider'
+ #start_urls = ['https://gavilan.curriqunet.com/catalog/iq/1826']
+ start_urls = ['https://www.gavilan.edu']
+
+
+ """
+ logging.getLogger("scrapy").setLevel(logging.CRITICAL)
+ logging.getLogger("scrapy.utils.log").setLevel(logging.CRITICAL)
+ logging.getLogger("scrapy.extensions.telnet").setLevel(logging.CRITICAL)
+ logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
+ logging.getLogger("scrapy.core.engine").setLevel(logging.CRITICAL)
+ logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
+
+ logger.disabled = True"""
+
+ def parse(self, response):
+ print('visited:', repr(response.url), 'status:', response.status)
+
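+            # PDFs: save the file, then pull its text out with pdfminer's extract_text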
+ if re.search(r'\.pdf$', response.url):
+                m = re.search(r'\/([^\/]+\.pdf)$', response.url)
+ if m:
+ print("saving to ", save_folder + '/' + clean_fn(response.url))
+ pdf_response = requests.get(response.url)
+ with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+ f.write(pdf_response.content)
+ text = extract_text(save_folder + '/' + clean_fn(response.url))
+ codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text)
+
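+        # Word/PowerPoint files: save the file, convert it to HTML with pypandoc, and keep a plain-text extract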
+ for ext in ['doc','docx','ppt','pptx']:
+ if re.search(r'\.'+ext+'$', response.url):
+ m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
+ if m:
+ print("saving to ", save_folder + '/' + clean_fn(response.url))
+ pdf_response = requests.get(response.url)
+ with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+ f.write(pdf_response.content)
+ #text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt')
+                    output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % save_folder])  # assumed media folder
+                    txt_output = trafilatura.extract(output, include_links=True, deduplicate=True, include_images=True, include_formatting=True)
+ if txt_output:
+ codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output)
+
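+        # images: just save the binary, no text extraction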
+ for ext in ['jpg','jpeg','gif','webp']:
+ if re.search(r'\.'+ext+'$', response.url):
+ m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
+ if m:
+ print("saving to ", save_folder + '/' + clean_fn(response.url))
+ pdf_response = requests.get(response.url)
+ with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+ f.write(pdf_response.content)
+
+        # skip the generic page-text dump for binary files handled above
+        if not re.search(r'\.(pdf|docx?|pptx?|jpe?g|gif|webp)$', response.url):
+            f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8')
+            this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
+            if this_output:
+                f_out.write(this_output)
+            f_out.close()
+ links = response.css('a::attr(href)').getall()
+
+ # Follow each link and parse its contents
+
+ for link in links:
+ go = 1
+ full_link = response.urljoin(link)
+ print('++++++ trying ', full_link)
+
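+            # stay on gavilan.edu, rewrite bare gavilan.edu links to www.gavilan.edu, and honor the avoid list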
+ if not re.search(r'gavilan\.edu',full_link):
+ go = 0
+ print('--- not gav edu')
+ else:
+ if re.search(r'hhh\.gavilan\.edu',full_link):
+ pass
+ elif not re.search(r'^https?:\/\/www\.gavilan\.edu',full_link):
+ # need to add www to gavilan.edu
+ m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$',full_link)
+ if m:
+                        full_link = m.group(1) + 'www.gavilan.edu' + m.group(2)
+ for a in avoid:
+ if re.search(a,full_link):
+ go = 0
+ print('--- avoid ', a)
+
+ if go: yield scrapy.Request(full_link, callback=self.parse,
+ headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"})
+ else:
+ print("------ avoiding ", full_link)
+ # Instantiate a CrawlerProcess object
+ process = CrawlerProcess()
+
+ # Add the MySpider spider to the process
+ process.crawl(MySpider)
+
+ # Start the process
+ logging.basicConfig(level=logging.CRITICAL)
+ logging.getLogger('scrapy').propagate = False
+ logging.getLogger("trafilatura").setLevel(logging.CRITICAL)
+ logging.getLogger("trafilatura").propagate = False
+ logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+ logging.getLogger("pdfminer").propagate = False
+ logging.getLogger("urllib3").setLevel(logging.CRITICAL)
+ logging.getLogger("urllib3").propagate = False
+ logging.basicConfig(level=logging.CRITICAL)
+ process.start()
-from pattern.web import plaintext, extension
-from pattern.web import download
-#from pattern import URL, MIMETYPE_IMAGE
-from pattern.web import Crawler, DEPTH
-import bs4
-import trafilatura
save_folder = 'cache/crawl'
clean_folder = 'cache/cleancrawl'
def clean_fn(s):
s = re.sub(r'[\s:]+','',s)
- s = re.sub(r'\/','_',s)
+ s = re.sub(r'\/','+',s)
return s
def format_html(html):
@@ -858,25 +1005,7 @@ def format_html(html):
return soup.prettify()
-class GavCrawl(Crawler):
- def visit(self, link, source=None):
- print('visited:', repr(link.url), 'from:', link.referrer)
- #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
- #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
-
- codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
-
- def fail(self, link):
- print('failed:', repr(link.url))
-
-def crawl():
- p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
- while not p.done:
- try:
- p.crawl(method=DEPTH, cached=False, throttle=0.76)
- except Exception as e:
- print("Exception: ", e)
def txt_clean_index():
files = os.listdir(save_folder)
@@ -902,33 +1031,61 @@ def txt_clean_index():
out.write(L + '\n')
out.close()
-def samples():
- crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
-
- url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
- print(url.mimetype in MIMETYPE_IMAGE)
- #html = download('http://www.clips.ua.ac.be/', unicode=True)
- s = URL('http://www.clips.ua.ac.be').download()
- s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+def search_embeddings():
+ model = SentenceTransformer('all-MiniLM-L6-v2')
+ save_embeds = pickle.load( open( "cache/embeddings.p", "rb" ) )
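+    # each row is [filename, sentence, embedding], as written by create_embeddings()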
+ columns = list(zip(*save_embeds))
+ files = columns[0]
+ sentences = columns[1]
+ embeddings = columns[2]
+
+ print(files[:20])
+ print(sentences[:20])
+ print(embeddings[:20])
+
+ s = ''
+ while s != 'q':
+ s = input("search or 'q' to quit: ")
+ if s == 'q':
+ return
+ query_embedding = model.encode(s)
+
+ # Compute the cosine similarity between the query embedding and the sentence embeddings
+ cosine_scores = util.cos_sim(query_embedding, embeddings)
+
+ # Sort the sentences by their cosine similarity to the query sentence
+        results = sorted(zip(sentences, cosine_scores[0], files), key=lambda x: x[1], reverse=True)
+
+ # Print the top 5 results
+ for i, (sentence, score, file) in enumerate(results[:5]):
+ print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')
- # getting absolute urls
- from pattern.web import URL, DOM, abs
+def create_embeddings():
+ model = SentenceTransformer('all-MiniLM-L6-v2')
+ files = os.listdir('cache/crawl')
+ output = []
+ save_embeds = [] # ['file','sentence','embedding']
+ files.sort()
+ for f in files:
+ m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+ if m:
+ lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines()
+ lines = [L.strip() for L in lines]
+ lines = [L for L in lines if L]
+ embeddings = model.encode(lines)
- url = URL('http://www.clips.ua.ac.be')
- dom = DOM(url.download())
- for link in dom('a'):
- print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
-
- # get pdfs
- from pattern.web import URL, PDF
-
- url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
- pdf = PDF(url.download())
- print(pdf.string)
+ print("\n-----", f)
+ #Print the embeddings
+ for sentence, embedding in zip(lines, embeddings):
+ print("Sentence:", sentence)
+ #print("Embedding:", embedding)
+
+                save_embeds.append([f,sentence,embedding])
+ pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) )
if __name__ == "__main__":
@@ -943,16 +1100,23 @@ if __name__ == "__main__":
7: ['demo vector search', demo_vector_search],
8: ['crawl',crawl],
9: ['clean text index', txt_clean_index],
+ 10: ['make web dir struct', manual_index],
+ 11: ['create search embeddings', create_embeddings],
+ 12: ['do a search', search_embeddings],
}
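+    # a menu number can also be passed on the command line, e.g. "python content.py 8"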
- for key in options:
- print(str(key) + '.\t' + options[key][0])
+ if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
+ resp = int(sys.argv[1])
+ print("\n\nPerforming: %s\n\n" % options[resp][0])
- print('')
- resp = input('Choose: ')
+ else:
+ print ('')
+ for key in options:
+ print(str(key) + '.\t' + options[key][0])
+
+ print('')
+ resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
-
-
-
+
diff --git a/depricated.py b/depricated.py
index d79baaf..0fb44e7 100644
--- a/depricated.py
+++ b/depricated.py
@@ -1807,3 +1807,95 @@ def freshdesk():
+
+
+#### content.py
+
+
+from pattern.web import plaintext, extension
+from pattern.web import download
+#from pattern import URL, MIMETYPE_IMAGE
+from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF
+
+class GavCrawl(Crawler):
+ def visit(self, link, source=None):
+ print('visited:', repr(link.url), 'from:', link.referrer)
+ print(' ', link.url.mimetype)
+ #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
+ #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
+
+ codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
+
+
+ def fail(self, link):
+ print('failed:', repr(link.url))
+ if re.search(r'\.pdf$', link.url):
+ m = re.search(r'\/([^\/]+\.pdf)$', link.url)
+ if m:
+ save_file = m.group(1)
+ print("saving to ", save_folder + '/' + save_file)
+ pdf_response = requests.get(link.url)
+ with open(save_folder + '/' + save_file, 'wb') as f:
+ f.write(pdf_response.content)
+ text = extract_text(save_folder + '/' + save_file)
+ #print(text)
+ codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text)
+ else:
+ print("no match for pdf url: ", link.url)
+
+ for ext in ['jpg','jpeg','gif','webp']:
+ if re.search(r'\.'+ext+'$', link.url):
+ m = re.search(r'\/([^\/]+\.'+ext+')$', link.url)
+ if m:
+ save_file = m.group(1)
+ print("saving to ", save_folder + '/' + save_file)
+ pdf_response = requests.get(link.url)
+ with open(save_folder + '/' + save_file, 'wb') as f:
+ f.write(pdf_response.content)
+ else:
+ print('no match for '+ext+' url: ', link.url)
+
+def crawl2():
+ #p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+ #p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+ p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+
+
+
+
+ while not p.done:
+ try:
+ p.crawl(method=DEPTH, cached=False, throttle=0.76)
+ except Exception as e:
+ print("Exception: ", e)
+
+
+
+def samples():
+ crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
+
+ url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
+ print(url.mimetype in MIMETYPE_IMAGE)
+
+
+ #html = download('http://www.clips.ua.ac.be/', unicode=True)
+ s = URL('http://www.clips.ua.ac.be').download()
+ s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+
+
+ # getting absolute urls
+ from pattern.web import URL, DOM, abs
+
+ url = URL('http://www.clips.ua.ac.be')
+ dom = DOM(url.download())
+ for link in dom('a'):
+ print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
+
+ # get pdfs
+ from pattern.web import URL, PDF
+
+ url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
+ pdf = PDF(url.download())
+ print(pdf.string)
+
+
diff --git a/gpt.py b/gpt.py
index 42dfe0d..36d5394 100644
--- a/gpt.py
+++ b/gpt.py
@@ -4,8 +4,8 @@ import openai
from canvas_secrets import openai_org, openai_api_key
-openai.organization = "org-66WLoZQEtBrO42Z9S8rfd10M"
-openai.api_key = "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"
+openai.organization = openai_org
+openai.api_key = openai_api_key
#print(openai.Model.list())
my_prompt = "Write a series of texts trying to sell a pen to a stranger."
diff --git a/interactive.py b/interactive.py
index 33d3413..7fb76f4 100644
--- a/interactive.py
+++ b/interactive.py
@@ -1,4 +1,3 @@
-import curses
import heapq, re, csv, os, shutil, datetime, urllib
import itertools, time, markdown, csv, json, os.path, webbrowser, threading
from functools import wraps
@@ -15,6 +14,20 @@ import localcache
from server import *
from canvas_secrets import flask_secretkey
+from content import my_site
+
+import socket
+this_host = socket.gethostname()
+
+print('\n\n' + this_host, '\n\n')
+
+has_curses = 0
+if this_host != 'ROGDESKTOP':
+ import curses
+ has_curses = 1
+else:
+ print("Skipping curses stuff")
+
q = Queue()
@@ -25,7 +38,6 @@ PORT_NUMBER = 8080 # Maybe set this to 9000.
datafile = 'lambda.csv'
-#writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'
####
@@ -95,7 +107,16 @@ def flask_thread(q):
+ @app.route('/mirror')
+ def mirror():
+ return codecs.open('cache/crawl/index.html','r','utf-8').read()
+
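+    # serve one crawled page: the markdown-rendered copy followed by the raw text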
+    @app.route('/mirror/<filename>')
+ def mirror_file(filename):
+ return markdown.markdown( codecs.open('cache/crawl/'+filename,'r','utf-8').read() ) + \
+               "<pre>" + codecs.open('cache/crawl/'+filename,'r','utf-8').read() + "</pre>"
+
@app.route('/clearscreens')
def clears():
clearscreens()
@@ -166,6 +187,7 @@ def flask_thread(q):
@app.route('/x/writing/images/<fname>')
def writing_img(fname):
+ # TODO
img_path = "/media/hd2/peter_home/Documents/writing_img/"
print(img_path + fname + " - writing images folder")
img_ext = fname.split('.')[-1]
diff --git a/server.py b/server.py
index aaac0f8..502e352 100644
--- a/server.py
+++ b/server.py
@@ -1,5 +1,5 @@
import json, codecs, re, markdown, os, pypandoc, striprtf, sqlite3, random, urllib
-import subprocess, html
+import subprocess, html, time
from striprtf.striprtf import rtf_to_text
from flask import render_template, Response
from flask import send_from_directory
@@ -16,8 +16,33 @@ from localcache import arrange_data_for_web, depts_with_classcounts, dept_with_s
from yattag import Doc
+import socket
+this_host = socket.gethostname()
+print('\n\n server host: ' + this_host, '\n\n')
+
LECPATH = "/media/hd2/peter_home_offload/lecture/"
- host = 'http://192.168.1.6:5000'
+host = 'http://192.168.1.6:5000'
+news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
+writing_path = '/media/hd2/peter_home/Documents/writing/'
+img_path = '/media/hd2/peter_home/Documents/writing_img/'
+pics_path = '/media/hd2/peter_home/misc/'
+
+
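+# per-host overrides: ROGDESKTOP (Windows) uses local drive paths and a different LAN address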
+if this_host == 'ROGDESKTOP':
+ LECPATH = "d:/peter_home_offload/lecture/"
+ host = 'http://192.168.1.7:5000'
+ news_path = 'd:/peter_home/Documents/scripts/browser/'
+ writing_path = 'd:/peter_home/Documents/writing/'
+ img_path = 'd:/peter_home/Documents/writing_img/'
+ pics_path = 'd:/peter_home/misc/'
+
+
+
+
+
+
+
+
import paho.mqtt.client as mqtt
@@ -55,20 +80,20 @@ def on_message(client, userdata, msg):
print(" %s mqtt msg: %s data: %s" % (now, msg.topic, msg.payload.decode()))
+if 0:
+ while(mqtt_offline):
+ try:
+ client = mqtt.Client()
+ client.on_connect = on_connect
+ client.on_message = on_message
-while(mqtt_offline):
- try:
- client = mqtt.Client()
- client.on_connect = on_connect
- client.on_message = on_message
+ client.connect("192.168.1.6", 1883, 60)
- client.connect("192.168.1.6", 1883, 60)
+ mqtt_offline = 0
- mqtt_offline = 0
-
- except OSError as oe:
- print('no internet? try again in 5 seconds.')
- time.sleep(5)
+ except OSError as oe:
+ print('no internet? try again in 5 seconds.')
+ time.sleep(5)
@@ -114,18 +139,6 @@ def screenoff():
######
-news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
-
-if platform.system() == 'Windows':
- writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'
-else:
- writing_path = '/media/hd2/peter_home/Documents/writing/'
- img_path = '/media/hd2/peter_home/Documents/writing_img/'
-
-if platform.system() == 'Windows':
- pics_path = 'c:/users/peter/Nextcloud/misc/'
-else:
- pics_path = '/media/hd2/peter_home/misc/'
br = "
"
nl = "\n"