content scraping

Peter Howell 2023-04-14 10:20:06 -07:00
parent 3f9fa2a8b5
commit ec8658cd8f
3 changed files with 2042 additions and 1637 deletions

apphelp.py (3596 lines changed)

File diff suppressed because it is too large.


@@ -6,6 +6,7 @@ from pipelines import header, fetch, url, put_file
 from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
 from bs4 import BeautifulSoup as bs
 from html.parser import HTMLParser
+from collections import defaultdict
 import tomd, checker
 import html2markdown as h2m
 import pypandoc
@@ -829,40 +830,83 @@ Schedule an In-Person, Phone or Zoom Appointment"""
 ## TODO site scraper
-## TODO finde package that extracts text from web page
+## TODO find package that extracts text from web page
 ### TODO master list of what to index.
-from pattern.web import URL, plaintext, extension
+## TODO PDFs and DOCXs
+## TODO fix urls w/ anchors
+from pattern.web import plaintext, extension
 from pattern.web import download
-from pattern import URL, MIMETYPE_IMAGE
-from pattern.web import Crawler
-from util import clean_title
+#from pattern import URL, MIMETYPE_IMAGE
+from pattern.web import Crawler, DEPTH
+import bs4
+import trafilatura
 
 save_folder = 'cache/crawl'
+clean_folder = 'cache/cleancrawl'
+
+def clean_fn(s):
+    s = re.sub(r'[\s:]+','',s)
+    s = re.sub(r'\/','_',s)
+    return s
+
+def format_html(html):
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    return soup.prettify()
 
 class GavCrawl(Crawler):
     def visit(self, link, source=None):
-        print 'visited:', repr(link.url), 'from:', link.referrer
-        txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
-        codecs.open(save_folder + '/' + clean_title(link.url) + '.txt').write(txt)
+        print('visited:', repr(link.url), 'from:', link.referrer)
+        #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
+        #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
+        codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
     def fail(self, link):
-        print 'failed:', repr(link.url)
+        print('failed:', repr(link.url))
 
 def crawl():
-    p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
+    p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
     while not p.done:
-        p.crawl(method=DEPTH, cached=False, throttle=3)
+        try:
+            p.crawl(method=DEPTH, cached=False, throttle=0.76)
+        except Exception as e:
+            print("Exception: ", e)
+
+def txt_clean_index():
+    files = os.listdir(save_folder)
+    line_freq = defaultdict(int)
+    # first pass
+    for f in files:
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        for L in lines:
+            L = L.strip()
+            line_freq[L] += 1
+    # second pass
+    for f in files:
+        print("\n\n",f)
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        out = codecs.open(clean_folder + '/' + f,'w','utf-8')
+        for L in lines:
+            L = L.strip()
+            if L in line_freq and line_freq[L] > 3:
+                continue
+            print(L)
+            out.write(L + '\n')
+        out.close()
 
 def samples():
     crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
     url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
-    print url.mimetype in MIMETYPE_IMAGE
+    print(url.mimetype in MIMETYPE_IMAGE)
 
     #html = download('http://www.clips.ua.ac.be/', unicode=True)
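For reference, the rewritten visit() above hands each crawled page to trafilatura instead of pattern.web's plaintext(). A minimal standalone sketch of that extraction step, assuming trafilatura is installed; the URL and variable names are illustrative only:

    import trafilatura

    # fetch one page and reduce it to readable text, keeping links, images
    # and basic formatting, mirroring the keyword arguments used in visit()
    html = trafilatura.fetch_url('http://www.gavilan.edu/')
    text = trafilatura.extract(html, include_links=True, deduplicate=True,
                               include_images=True, include_formatting=True)
    print(text)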
@@ -876,14 +920,14 @@ def samples():
     url = URL('http://www.clips.ua.ac.be')
     dom = DOM(url.download())
     for link in dom('a'):
-        print abs(link.attributes.get('href',''), base=url.redirect or url.string)
+        print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
 
     # get pdfs
     from pattern.web import URL, PDF
     url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
     pdf = PDF(url.download())
-    print pdf.string
+    print(pdf.string)
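The PDF snippet in samples() is one route toward the "## TODO PDFs and DOCXs" note in the earlier hunk. A rough sketch of pulling a PDF's text and dropping it into the same crawl cache, assuming the pattern library is installed; the URL and output filename are illustrative only:

    import codecs
    from pattern.web import URL, PDF

    # download a PDF and extract its text, as in samples()
    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
    pdf = PDF(url.download())
    # write the extracted text next to the crawled pages
    codecs.open('cache/crawl/ctrs-002_0.txt', 'w', 'utf-8').write(pdf.string)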
@@ -897,6 +941,8 @@ if __name__ == "__main__":
         # 5: ['import freshdesk content', freshdesk ],
         6: ['download all a courses pages', grab_course_pages],
         7: ['demo vector search', demo_vector_search],
+        8: ['crawl',crawl],
+        9: ['clean text index', txt_clean_index],
         }
     for key in options:


@@ -1086,6 +1086,7 @@ def add_evals(section=0):
     s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
     s = list(funcy.flatten(s))
     s.sort()
+    print(s)
     xyz = input('hit return to continue')
     #c = getCoursesInTerm(168,0,1)
@@ -1306,7 +1307,6 @@ def set_ext_tools():
 if __name__ == "__main__":
     options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
-                30: ['List latestart classes', list_latestarts ],
                 2: ['Add announcements to homepage', change_course_ann_homepage],
                 3: ['Cross-list classes', xlist ],
                 4: ['List students who passed quiz X', get_quiz_passers],
@@ -1335,6 +1335,7 @@ if __name__ == "__main__":
                 27: ['Fine tune term dates and winter session', course_dates_terms],
                 28: ['Cross list a semester from file', semester_cross_lister],
                 29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
+                #30: ['List latestart classes', list_latestarts ],
                 # TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
                 #
                 }