start web crawler
This commit is contained in:
parent
907a46acd3
commit
3f9fa2a8b5
59
content.py
59
content.py
|
|
@ -828,11 +828,64 @@ Schedule an In-Person, Phone or Zoom Appointment"""
|
||||||
print(f"Vector for the word '{example_word}': {vector}")
|
print(f"Vector for the word '{example_word}': {vector}")
|
||||||
|
|
||||||
|
|
||||||
## TODO site scraper
|
## TODO site scraper
|
||||||
|
|
||||||
|
## TODO find a package that extracts text from a web page
|
||||||
|
|
||||||
|
### TODO master list of what to index.
|
||||||
|
|
||||||
|
# Standard library
import codecs

# Third-party: CLiPS Pattern web-mining toolkit
from pattern.web import URL, plaintext, extension
from pattern.web import download
from pattern import URL, MIMETYPE_IMAGE  # NOTE(review): shadows pattern.web.URL above -- confirm intended
from pattern.web import Crawler
from pattern.web import DEPTH, FIFO

# Local
from util import clean_title
|
||||||
|
|
||||||
|
# Directory where crawled pages are cached as .txt files
# (presumably must exist before crawl() runs -- TODO confirm).
save_folder = 'cache/crawl'
|
||||||
|
|
||||||
|
class GavCrawl(Crawler):
    """Crawler over www.gavilan.edu that caches each visited page's
    plain text under ``save_folder``.

    Subclasses pattern.web.Crawler; ``visit`` and ``fail`` are the
    Crawler callback hooks.
    """

    def visit(self, link, source=None):
        """Called for every successfully fetched page: strip the markup
        and write the plain text to save_folder/<cleaned-url>.txt."""
        print('visited: %r from: %s' % (link.url, link.referrer))
        # Drop all markup; a keep={...} mapping could preserve selected tags.
        txt = plaintext(link.source)  ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
        # BUG FIX: codecs.open() defaults to read mode, so the original
        # .write() raised IOError; open for writing with an explicit
        # encoding, and close the handle via the context manager.
        path = save_folder + '/' + clean_title(link.url) + '.txt'
        with codecs.open(path, 'w', encoding='utf-8') as fh:
            fh.write(txt)

    def fail(self, link):
        """Called when a link cannot be downloaded."""
        print('failed: %r' % link.url)
|
||||||
|
|
||||||
|
def crawl():
    """Crawl www.gavilan.edu depth-first, caching each page's text
    through GavCrawl.visit(); 3-second throttle between requests."""
    crawler = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
    # Keep pulling pages until the frontier is exhausted.
    while not crawler.done:
        crawler.crawl(method=DEPTH, cached=False, throttle=3)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def samples():
    """Assorted pattern.web usage examples (scratch code, not used by
    the crawler itself): mimetype check, plaintext extraction,
    absolute-URL resolution, and PDF text extraction."""
    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)

    # BUG FIX: original URL read 'ua.ac.bemedia/...' -- the '/' before
    # 'media' was missing, so the request could never resolve.
    url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
    print(url.mimetype in MIMETYPE_IMAGE)

    #html = download('http://www.clips.ua.ac.be/', unicode=True)
    s = URL('http://www.clips.ua.ac.be').download()
    s = plaintext(s, keep={'h1': [], 'h2': [], 'strong': [], 'a': ['href']})

    # getting absolute urls
    from pattern.web import URL, DOM, abs
    url = URL('http://www.clips.ua.ac.be')
    dom = DOM(url.download())
    for link in dom('a'):
        print(abs(link.attributes.get('href', ''), base=url.redirect or url.string))

    # get pdfs
    from pattern.web import URL, PDF
    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
    pdf = PDF(url.download())
    print(pdf.string)
|
||||||
|
|
||||||
## TODO find a package that extracts text from a web page
|
|
||||||
|
|
||||||
### TODO master list of what to index.
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue