content scraping
parent 3f9fa2a8b5
commit ec8658cd8f

Changed files:
apphelp.py: 3596 lines changed (diff suppressed because the file is too large)
content.py: 80 lines changed
@@ -6,6 +6,7 @@ from pipelines import header, fetch, url, put_file
 from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
 from bs4 import BeautifulSoup as bs
 from html.parser import HTMLParser
+from collections import defaultdict
 import tomd, checker
 import html2markdown as h2m
 import pypandoc
@@ -829,40 +830,83 @@ Schedule an In-Person, Phone or Zoom Appointment"""


 ## TODO site scraper
-## TODO finde package that extracts text from web page
+## TODO find package that extracts text from web page

 ### TODO master list of what to index.

-from pattern.web import URL, plaintext, extension
+## TODO PDFs and DOCXs
+## TODO fix urls w/ anchors
+
+from pattern.web import plaintext, extension
 from pattern.web import download
-from pattern import URL, MIMETYPE_IMAGE
+#from pattern import URL, MIMETYPE_IMAGE
-from pattern.web import Crawler
+from pattern.web import Crawler, DEPTH
-from util import clean_title
+import bs4
+import trafilatura

 save_folder = 'cache/crawl'
+clean_folder = 'cache/cleancrawl'

+def clean_fn(s):
+    s = re.sub(r'[\s:]+','',s)
+    s = re.sub(r'\/','_',s)
+    return s
+
+def format_html(html):
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    return soup.prettify()


 class GavCrawl(Crawler):
     def visit(self, link, source=None):
-        print 'visited:', repr(link.url), 'from:', link.referrer
+        print('visited:', repr(link.url), 'from:', link.referrer)
-        txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+        #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
-        codecs.open(save_folder + '/' + clean_title(link.url) + '.txt').write(txt)
+        #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
+
+        codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))

     def fail(self, link):
-        print 'failed:', repr(link.url)
+        print('failed:', repr(link.url))

 def crawl():
-    p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
+    p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
     while not p.done:
-        p.crawl(method=DEPTH, cached=False, throttle=3)
+        try:
+            p.crawl(method=DEPTH, cached=False, throttle=0.76)
+        except Exception as e:
+            print("Exception: ", e)

+def txt_clean_index():
+    files = os.listdir(save_folder)
+    line_freq = defaultdict(int)
+
+    # first pass
+    for f in files:
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        for L in lines:
+            L = L.strip()
+            line_freq[L] += 1
+
+    # second pass
+    for f in files:
+        print("\n\n",f)
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        out = codecs.open(clean_folder + '/' + f,'w','utf-8')
+        for L in lines:
+            L = L.strip()
+            if L in line_freq and line_freq[L] > 3:
+                continue
+            print(L)
+            out.write(L + '\n')
+        out.close()


 def samples():
     crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)

     url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
-    print url.mimetype in MIMETYPE_IMAGE
+    print(url.mimetype in MIMETYPE_IMAGE)

     #html = download('http://www.clips.ua.ac.be/', unicode=True)

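The rewritten visit() handler above swaps the old pattern.web plaintext pipeline for trafilatura. For reference, a minimal standalone sketch of that extraction step follows; inside the crawler the page source is supplied by pattern.web, so fetch_url, the example URL, and the output file name here are placeholders, not part of the commit.

    # Sketch only: extract readable text from one page the way visit() does.
    # fetch_url, the URL, and the output path are placeholders.
    import codecs
    import trafilatura

    html = trafilatura.fetch_url('http://www.gavilan.edu/')
    if html:
        text = trafilatura.extract(html, include_links=True, deduplicate=True,
                                   include_images=True, include_formatting=True)
        if text:  # extract() returns None when no main content is found
            codecs.open('cache/crawl/example.txt', 'w', 'utf-8').write(text)
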
@@ -876,14 +920,14 @@ def samples():
     url = URL('http://www.clips.ua.ac.be')
     dom = DOM(url.download())
     for link in dom('a'):
-        print abs(link.attributes.get('href',''), base=url.redirect or url.string)
+        print(abs(link.attributes.get('href',''), base=url.redirect or url.string))

     # get pdfs
     from pattern.web import URL, PDF

     url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
     pdf = PDF(url.download())
-    print pdf.string
+    print(pdf.string)


@@ -897,6 +941,8 @@ if __name__ == "__main__":
                # 5: ['import freshdesk content', freshdesk ],
                6: ['download all a courses pages', grab_course_pages],
                7: ['demo vector search', demo_vector_search],
+               8: ['crawl',crawl],
+               9: ['clean text index', txt_clean_index],
                }

     for key in options:

@@ -1086,6 +1086,7 @@ def add_evals(section=0):
     s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
     s = list(funcy.flatten(s))
     s.sort()
+    print(s)
     xyz = input('hit return to continue')

     #c = getCoursesInTerm(168,0,1)

@@ -1306,7 +1307,6 @@ def set_ext_tools():

 if __name__ == "__main__":
     options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
-               30: ['List latestart classes', list_latestarts ],
                2: ['Add announcements to homepage', change_course_ann_homepage],
                3: ['Cross-list classes', xlist ],
                4: ['List students who passed quiz X', get_quiz_passers],

@@ -1335,6 +1335,7 @@ if __name__ == "__main__":
                27: ['Fine tune term dates and winter session', course_dates_terms],
                28: ['Cross list a semester from file', semester_cross_lister],
                29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
+               #30: ['List latestart classes', list_latestarts ],
                # TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
                #
                }
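Taken together, the new menu entries 8 and 9 form a small two-step pipeline: crawl the site into cache/crawl, then write copies with repeated boilerplate lines stripped into cache/cleancrawl. A rough usage sketch, assuming content.py imports cleanly as a module and that the cache folders may need to be created first (both assumptions, not shown in the commit):

    # Sketch only: run the two new steps directly instead of through the menu.
    # The import path and the directory setup are assumptions.
    import os
    from content import crawl, txt_clean_index, save_folder, clean_folder

    os.makedirs(save_folder, exist_ok=True)    # 'cache/crawl'
    os.makedirs(clean_folder, exist_ok=True)   # 'cache/cleancrawl'

    crawl()              # option 8: crawl gavilan.edu, saving trafilatura text per page
    txt_clean_index()    # option 9: drop lines seen more than 3 times across the crawl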