diff --git a/content.py b/content.py
index 56307a6..a855d93 100644
--- a/content.py
+++ b/content.py
@@ -828,12 +828,65 @@ Schedule an In-Person, Phone or Zoom Appointment"""
 
     print(f"Vector for the word '{example_word}': {vector}")
 
-    ## TODO site scraper
+## TODO site scraper
+
+## TODO find a package that extracts text from web pages
+
+### TODO master list of what to index.
+
+import codecs
+
+from pattern.web import URL, plaintext, extension, download
+from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE
+from util import clean_title
+
+save_folder = 'cache/crawl'
+
+
+class GavCrawl(Crawler):
+    """Crawler that saves the plain text of every visited page."""
+
+    def visit(self, link, source=None):
+        print('visited:', repr(link.url), 'from:', link.referrer)
+        # source holds the downloaded HTML; optionally keep some markup,
+        # e.g. keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']}
+        txt = plaintext(source or '')
+        with codecs.open(save_folder + '/' + clean_title(link.url) + '.txt', 'w', encoding='utf-8') as f:
+            f.write(txt)
+
+    def fail(self, link):
+        print('failed:', repr(link.url))
+
+
+def crawl():
+    # Depth-first crawl of gavilan.edu, writing one text file per page.
+    p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
+    while not p.done:
+        p.crawl(method=DEPTH, cached=False, throttle=3)
+
+
+def samples():
+    # Assorted pattern.web usage examples kept for reference.
+    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
+
+    # check a URL's mimetype
+    url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
+    print(url.mimetype in MIMETYPE_IMAGE)
+
+    # download a page and reduce it to plain text
+    #html = download('http://www.clips.ua.ac.be/', unicode=True)
+    s = URL('http://www.clips.ua.ac.be').download()
+    s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+
+    # getting absolute urls
+    from pattern.web import DOM, abs
+
+    url = URL('http://www.clips.ua.ac.be')
+    dom = DOM(url.download())
+    for link in dom('a'):
+        print(abs(link.attributes.get('href', ''), base=url.redirect or url.string))
+
+    # get pdfs
+    from pattern.web import PDF
+
+    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
+    pdf = PDF(url.download())
+    print(pdf.string)
+
-    ## TODO finde package that extracts text from web page
-    ### TODO master list of what to index.
-
 if __name__ == "__main__":
     print ('')