the vector search toy

parent ec8658cd8f
commit 532a71a3da

content.py (274 lines changed)
@@ -1,18 +1,18 @@
 #saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
-import requests, codecs, os, re, json
+import requests, codecs, os, re, json, sys, pypandoc
+import webbrowser, bs4, trafilatura, pickle, tomd, checker
+import html2markdown as h2m
 from pipelines import header, fetch, url, put_file
 from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
 from bs4 import BeautifulSoup as bs
 from html.parser import HTMLParser
 from collections import defaultdict
-import tomd, checker
-import html2markdown as h2m
-import pypandoc
-import webbrowser
-h = HTMLParser()
+from pdfminer.high_level import extract_text
+from sentence_transformers import SentenceTransformer, util

+h = HTMLParser()

 DBG = 1

@@ -22,8 +22,6 @@ def d(s):
-
-

 # Download everything interesting in a course to a local folder
 # Build a master file with the entire class content
 def accessible_check(id=""):

@@ -829,6 +827,44 @@ Schedule an In-Person, Phone or Zoom Appointment"""
     print(f"Vector for the word '{example_word}': {vector}")


+def makedir():
+    files = os.listdir('cache/crawl')
+    #print(files)
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            print(parts)
+
+def manual_index():
+    files = os.listdir('cache/crawl')
+    #print(files)
+    ii = codecs.open('cache/crawl/index.html','w','utf-8')
+    ii.write('<html><body><h1>Site index</h1>\n')
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            ii.write('<br /><a href="mirror/'+f+'">'+f+'</a>\n')
+
+def my_site():
+    files = os.listdir('cache/crawl')
+    output = []
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            output.append(parts)
+    return output
+
+
 ## TODO site scraper
 ## TODO find package that extracts text from web page
 ### TODO master list of what to index.

@@ -836,21 +872,132 @@ Schedule an In-Person, Phone or Zoom Appointment"""
 ## TODO PDFs and DOCXs
 ## TODO fix urls w/ anchors

+def crawl():
+    import scrapy, logging
+    from scrapy.crawler import CrawlerProcess
+
+    logger = logging.getLogger()
+    logger.setLevel(level=logging.CRITICAL)
+    logging.basicConfig(level=logging.CRITICAL)
+    logger.disabled = True
+
+    avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft',
+             'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ]
+
+    class MySpider(scrapy.Spider):
+        name = 'myspider'
+        #start_urls = ['https://gavilan.curriqunet.com/catalog/iq/1826']
+        start_urls = ['https://www.gavilan.edu']
+
+        """
+        logging.getLogger("scrapy").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.utils.log").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.extensions.telnet").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.core.engine").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
+
+        logger.disabled = True"""
+
+        def parse(self, response):
+            print('visited:', repr(response.url), 'status:', response.status)
+
+            if re.search(r'\.pdf$', response.url):
+                m = re.search(r'\/([^\/]+\.pdf)$', response.url)
+                if m:
+                    print("saving to ", save_folder + '/' + clean_fn(response.url))
+                    pdf_response = requests.get(response.url)
+                    with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+                        f.write(pdf_response.content)
+                    text = extract_text(save_folder + '/' + clean_fn(response.url))
+                    codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text)
+
+            for ext in ['doc','docx','ppt','pptx']:
+                if re.search(r'\.'+ext+'$', response.url):
+                    m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
+                    if m:
+                        print("saving to ", save_folder + '/' + clean_fn(response.url))
+                        pdf_response = requests.get(response.url)
+                        with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+                            f.write(pdf_response.content)
+                        #text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt')
+                        output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % hash ])
+                        txt_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
+                        if txt_output:
+                            codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output)
+
+            for ext in ['jpg','jpeg','gif','webp']:
+                if re.search(r'\.'+ext+'$', response.url):
+                    m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
+                    if m:
+                        print("saving to ", save_folder + '/' + clean_fn(response.url))
+                        pdf_response = requests.get(response.url)
+                        with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+                            f.write(pdf_response.content)
+
+            f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8')
+
+            this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
+            if this_output:
+                f_out.write(this_output)
+            f_out.close()
+            links = response.css('a::attr(href)').getall()
+
+            # Follow each link and parse its contents
+            for link in links:
+                go = 1
+                full_link = response.urljoin(link)
+                print('++++++ trying ', full_link)
+
+                if not re.search(r'gavilan\.edu',full_link):
+                    go = 0
+                    print('--- not gav edu')
+                else:
+                    if re.search(r'hhh\.gavilan\.edu',full_link):
+                        pass
+                    elif not re.search(r'^https?:\/\/www\.gavilan\.edu',full_link):
+                        # need to add www to gavilan.edu
+                        m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$',full_link)
+                        if m:
+                            full_link = m.group(1) + 'www.gavilan.edu' + m.group(2)
+                    for a in avoid:
+                        if re.search(a,full_link):
+                            go = 0
+                            print('--- avoid ', a)
+
+                if go: yield scrapy.Request(full_link, callback=self.parse,
+                    headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"})
+                else:
+                    print("------ avoiding ", full_link)
+
+    # Instantiate a CrawlerProcess object
+    process = CrawlerProcess()
+
+    # Add the MySpider spider to the process
+    process.crawl(MySpider)
+
+    # Start the process
+    logging.basicConfig(level=logging.CRITICAL)
+    logging.getLogger('scrapy').propagate = False
+    logging.getLogger("trafilatura").setLevel(logging.CRITICAL)
+    logging.getLogger("trafilatura").propagate = False
+    logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+    logging.getLogger("pdfminer").propagate = False
+    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
+    logging.getLogger("urllib3").propagate = False
+    logging.basicConfig(level=logging.CRITICAL)
+    process.start()
+
+
-from pattern.web import plaintext, extension
-from pattern.web import download
-#from pattern import URL, MIMETYPE_IMAGE
-from pattern.web import Crawler, DEPTH
-import bs4
-import trafilatura
-
 save_folder = 'cache/crawl'
 clean_folder = 'cache/cleancrawl'

 def clean_fn(s):
     s = re.sub(r'[\s:]+','',s)
-    s = re.sub(r'\/','_',s)
+    s = re.sub(r'\/','+',s)
     return s

 def format_html(html):

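The new crawl() above quiets the root logger with several separate calls before starting the process. If it helps, Scrapy also accepts the log level directly as a crawler setting, so one settings dict covers the Scrapy side. This is a minimal sketch, not part of the commit, and it assumes the MySpider class defined inside crawl():

from scrapy.crawler import CrawlerProcess

# One settings dict instead of repeated logging.basicConfig() calls.
process = CrawlerProcess(settings={'LOG_LEVEL': 'CRITICAL'})   # or 'LOG_ENABLED': False
process.crawl(MySpider)
process.start()

Libraries such as trafilatura and pdfminer log through their own loggers, so the per-logger setLevel calls for those still apply.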
@@ -858,25 +1005,7 @@ def format_html(html):
     return soup.prettify()

-class GavCrawl(Crawler):
-    def visit(self, link, source=None):
-        print('visited:', repr(link.url), 'from:', link.referrer)
-        #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
-        #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
-
-        codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
-
-    def fail(self, link):
-        print('failed:', repr(link.url))
-
-def crawl():
-    p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
-    while not p.done:
-        try:
-            p.crawl(method=DEPTH, cached=False, throttle=0.76)
-        except Exception as e:
-            print("Exception: ", e)
-
 def txt_clean_index():
     files = os.listdir(save_folder)

@@ -902,33 +1031,61 @@ def txt_clean_index():
         out.write(L + '\n')
     out.close()

-def samples():
-    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
-
-    url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
-    print(url.mimetype in MIMETYPE_IMAGE)
-
-    #html = download('http://www.clips.ua.ac.be/', unicode=True)
-    s = URL('http://www.clips.ua.ac.be').download()
-    s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+def search_embeddings():
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    save_embeds = pickle.load( open( "cache/embeddings.p", "rb" ) )
+    columns = list(zip(*save_embeds))
+    files = columns[0]
+    sentences = columns[1]
+    embeddings = columns[2]
+
+    print(files[:20])
+    print(sentences[:20])
+    print(embeddings[:20])
+
+    s = ''
+    while s != 'q':
+        s = input("search or 'q' to quit: ")
+        if s == 'q':
+            return
+        query_embedding = model.encode(s)
+
+        # Compute the cosine similarity between the query embedding and the sentence embeddings
+        cosine_scores = util.cos_sim(query_embedding, embeddings)
+
+        # Sort the sentences by their cosine similarity to the query sentence
+        results = sorted(zip(sentences, cosine_scores[0], files), key=lambda x: x[1], reverse=True)  # cos_sim returns a 1 x N matrix; take row 0
+
+        # Print the top 5 results
+        for i, (sentence, score, file) in enumerate(results[:5]):
+            print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')
+
+
-# getting absolute urls
-from pattern.web import URL, DOM, abs
-
-url = URL('http://www.clips.ua.ac.be')
-dom = DOM(url.download())
-for link in dom('a'):
-    print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
-
-# get pdfs
-from pattern.web import URL, PDF
-
-url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
-pdf = PDF(url.download())
-print(pdf.string)
+def create_embeddings():
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    files = os.listdir('cache/crawl')
+    output = []
+    save_embeds = [] # ['file','sentence','embedding']
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines()
+            lines = [L.strip() for L in lines]
+            lines = [L for L in lines if L]
+            embeddings = model.encode(lines)
+
+            print("\n-----", f)
+
+            #Print the embeddings
+            for sentence, embedding in zip(lines, embeddings):
+                print("Sentence:", sentence)
+                #print("Embedding:", embedding)
+
+                save_embeds.append([f,sentence,embedding])
+    pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) )

 if __name__ == "__main__":

@@ -943,8 +1100,17 @@ if __name__ == "__main__":
         7: ['demo vector search', demo_vector_search],
         8: ['crawl',crawl],
         9: ['clean text index', txt_clean_index],
+        10: ['make web dir struct', manual_index],
+        11: ['create search embeddings', create_embeddings],
+        12: ['do a search', search_embeddings],
     }

+    if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
+        resp = int(sys.argv[1])
+        print("\n\nPerforming: %s\n\n" % options[resp][0])
+
+    else:
+        print ('')
     for key in options:
         print(str(key) + '.\t' + options[key][0])

@@ -954,5 +1120,3 @@ if __name__ == "__main__":
     # Call the function in the options dict
     options[ int(resp)][1]()
-
-

@@ -1807,3 +1807,95 @@ def freshdesk():


+#### content.py
+
+from pattern.web import plaintext, extension
+from pattern.web import download
+#from pattern import URL, MIMETYPE_IMAGE
+from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF
+
+class GavCrawl(Crawler):
+    def visit(self, link, source=None):
+        print('visited:', repr(link.url), 'from:', link.referrer)
+        print(' ', link.url.mimetype)
+        #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
+        #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
+
+        codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
+
+    def fail(self, link):
+        print('failed:', repr(link.url))
+        if re.search(r'\.pdf$', link.url):
+            m = re.search(r'\/([^\/]+\.pdf)$', link.url)
+            if m:
+                save_file = m.group(1)
+                print("saving to ", save_folder + '/' + save_file)
+                pdf_response = requests.get(link.url)
+                with open(save_folder + '/' + save_file, 'wb') as f:
+                    f.write(pdf_response.content)
+                text = extract_text(save_folder + '/' + save_file)
+                #print(text)
+                codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text)
+            else:
+                print("no match for pdf url: ", link.url)
+
+        for ext in ['jpg','jpeg','gif','webp']:
+            if re.search(r'\.'+ext+'$', link.url):
+                m = re.search(r'\/([^\/]+\.'+ext+')$', link.url)
+                if m:
+                    save_file = m.group(1)
+                    print("saving to ", save_folder + '/' + save_file)
+                    pdf_response = requests.get(link.url)
+                    with open(save_folder + '/' + save_file, 'wb') as f:
+                        f.write(pdf_response.content)
+                else:
+                    print('no match for '+ext+' url: ', link.url)
+
+def crawl2():
+    #p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+    #p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+    p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+
+    while not p.done:
+        try:
+            p.crawl(method=DEPTH, cached=False, throttle=0.76)
+        except Exception as e:
+            print("Exception: ", e)
+
+def samples():
+    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
+
+    url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
+    print(url.mimetype in MIMETYPE_IMAGE)
+
+    #html = download('http://www.clips.ua.ac.be/', unicode=True)
+    s = URL('http://www.clips.ua.ac.be').download()
+    s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+
+    # getting absolute urls
+    from pattern.web import URL, DOM, abs
+
+    url = URL('http://www.clips.ua.ac.be')
+    dom = DOM(url.download())
+    for link in dom('a'):
+        print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
+
+    # get pdfs
+    from pattern.web import URL, PDF
+
+    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
+    pdf = PDF(url.download())
+    print(pdf.string)

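search_embeddings() ranks every stored sentence with util.cos_sim and a manual sort. sentence_transformers also provides util.semantic_search, which returns the top-k matches directly. The following is a minimal sketch, not part of the commit, assuming the cache/embeddings.p rows of [file, sentence, embedding] written by create_embeddings() and an illustrative query string:

import pickle
import numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
rows = pickle.load(open('cache/embeddings.p', 'rb'))   # [[file, sentence, embedding], ...]
files, sentences, embeddings = zip(*rows)
corpus = np.stack(embeddings)                          # (n_sentences, 384) matrix for all-MiniLM-L6-v2

query = model.encode('how do I book a counseling appointment')   # illustrative query
hits = util.semantic_search(query, corpus, top_k=5)[0]           # list of {'corpus_id': ..., 'score': ...}
for h in hits:
    i = h['corpus_id']
    print(f"{h['score']:.3f}  {files[i]}  {sentences[i]}")

Encoding the whole list of lines per file with one model.encode() call, as the commit does in create_embeddings(), is the faster path compared with encoding sentence by sentence.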
gpt.py (4 lines changed)

@@ -4,8 +4,8 @@ import openai
 from canvas_secrets import openai_org, openai_api_key


-openai.organization = "org-66WLoZQEtBrO42Z9S8rfd10M"
-openai.api_key = "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"
+openai.organization = openai_org
+openai.api_key = openai_api_key
 #print(openai.Model.list())

 my_prompt = "Write a series of texts trying to sell a pen to a stranger."

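The change above moves the hard-coded OpenAI credentials out of gpt.py and into the canvas_secrets module. An equivalent pattern, if preferred, is to read them from environment variables so no key sits in the repository at all; a minimal sketch, not part of the commit, with illustrative variable names:

import os
import openai

# Raises KeyError if the variables are unset, which is better than sending empty credentials.
openai.organization = os.environ['OPENAI_ORG_ID']
openai.api_key = os.environ['OPENAI_API_KEY']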
@@ -1,4 +1,3 @@
-import curses
 import heapq, re, csv, os, shutil, datetime, urllib
 import itertools, time, markdown, csv, json, os.path, webbrowser, threading
 from functools import wraps

@@ -15,6 +14,20 @@ import localcache
 from server import *
 from canvas_secrets import flask_secretkey

+from content import my_site
+
+import socket
+this_host = socket.gethostname()
+
+print('\n\n' + this_host, '\n\n')
+
+has_curses = 0
+if this_host != 'ROGDESKTOP':
+    import curses
+    has_curses = 1
+else:
+    print("Skipping curses stuff")
+
 q = Queue()

@@ -25,7 +38,6 @@ PORT_NUMBER = 8080 # Maybe set this to 9000.

 datafile = 'lambda.csv'

-#writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'

 ####

@@ -95,6 +107,15 @@ def flask_thread(q):

+    @app.route('/mirror')
+    def mirror():
+        return codecs.open('cache/crawl/index.html','r','utf-8').read()
+
+    @app.route('/mirror/<filename>')
+    def mirror_file(filename):
+        return markdown.markdown( codecs.open('cache/crawl/'+filename,'r','utf-8').read() ) + \
+               "<pre>" + codecs.open('cache/crawl/'+filename,'r','utf-8').read() + "</pre>"
+
     @app.route('/clearscreens')
     def clears():

@@ -166,6 +187,7 @@ def flask_thread(q):
     @app.route('/x/writing/images/<fname>')
     def writing_img(fname):
+        # TODO
         img_path = "/media/hd2/peter_home/Documents/writing_img/"
         print(img_path + fname + " - writing images folder")
         img_ext = fname.split('.')[-1]

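The /mirror/<filename> route above opens 'cache/crawl/'+filename directly, so the filename is trusted as given. Flask's send_from_directory serves files from a fixed folder and rejects paths that escape it; a minimal standalone sketch, not part of the commit, shown outside flask_thread() for brevity:

from flask import Flask, send_from_directory

app = Flask(__name__)

@app.route('/mirror/<path:filename>')
def mirror_file(filename):
    # Only files under cache/crawl are reachable; traversal attempts get a 404.
    return send_from_directory('cache/crawl', filename)

If the markdown-plus-<pre> rendering is still wanted, the same directory check can be applied before reading the file.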
server.py (45 lines changed)

@@ -1,5 +1,5 @@
 import json, codecs, re, markdown, os, pypandoc, striprtf, sqlite3, random, urllib
-import subprocess, html
+import subprocess, html, time
 from striprtf.striprtf import rtf_to_text
 from flask import render_template, Response
 from flask import send_from_directory

@@ -16,8 +16,33 @@ from localcache import arrange_data_for_web, depts_with_classcounts, dept_with_s
 from yattag import Doc

+import socket
+this_host = socket.gethostname()
+print('\n\n server host: ' + this_host, '\n\n')
+
 LECPATH = "/media/hd2/peter_home_offload/lecture/"
 host = 'http://192.168.1.6:5000'
+news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
+writing_path = '/media/hd2/peter_home/Documents/writing/'
+img_path = '/media/hd2/peter_home/Documents/writing_img/'
+pics_path = '/media/hd2/peter_home/misc/'
+
+if this_host == 'ROGDESKTOP':
+    LECPATH = "d:/peter_home_offload/lecture/"
+    host = 'http://192.168.1.7:5000'
+    news_path = 'd:/peter_home/Documents/scripts/browser/'
+    writing_path = 'd:/peter_home/Documents/writing/'
+    img_path = 'd:/peter_home/Documents/writing_img/'
+    pics_path = 'd:/peter_home/misc/'
+
+
 import paho.mqtt.client as mqtt

@@ -55,8 +80,8 @@ def on_message(client, userdata, msg):
     print(" %s mqtt msg: %s data: %s" % (now, msg.topic, msg.payload.decode()))


-
+if 0:
     while(mqtt_offline):
         try:
             client = mqtt.Client()
             client.on_connect = on_connect

@@ -114,18 +139,6 @@ def screenoff():
 ######

-news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
-
-if platform.system() == 'Windows':
-    writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'
-else:
-    writing_path = '/media/hd2/peter_home/Documents/writing/'
-img_path = '/media/hd2/peter_home/Documents/writing_img/'
-
-if platform.system() == 'Windows':
-    pics_path = 'c:/users/peter/Nextcloud/misc/'
-else:
-    pics_path = '/media/hd2/peter_home/misc/'
-
 br = "<br />"
 nl = "\n"