start web crawler
This commit is contained in:
parent
907a46acd3
commit
3f9fa2a8b5
59
content.py
59
content.py
|
|
@ -828,11 +828,64 @@ Schedule an In-Person, Phone or Zoom Appointment"""
|
|||
print(f"Vector for the word '{example_word}': {vector}")
|
||||
|
||||
|
||||
## TODO site scraper
|
||||
## TODO site scraper
|
||||
|
||||
## TODO find a package that extracts text from a web page
|
||||
|
||||
### TODO master list of what to index.
|
||||
|
||||
import codecs

# NOTE(review): `from pattern import URL, MIMETYPE_IMAGE` shadows the
# pattern.web URL import below and likely should be `from pattern.web import
# MIMETYPE_IMAGE` — kept as-is, confirm against the installed pattern version.
from pattern import URL, MIMETYPE_IMAGE
from pattern.web import URL, plaintext, extension
from pattern.web import download
from pattern.web import Crawler
from pattern.web import DEPTH, FIFO

from util import clean_title
|
||||
|
||||
save_folder = 'cache/crawl'
|
||||
|
||||
class GavCrawl(Crawler):
    """Crawler that caches the plain text of every visited page.

    Each successfully visited page is stripped to plain text with
    pattern.web.plaintext and written to ``save_folder`` under a filename
    derived from the URL via util.clean_title.
    """

    def visit(self, link, source=None):
        """Extract plain text from a visited page and write it to the cache.

        link   -- pattern.web crawler link object (has .url, .referrer, .source).
        source -- unused; kept for the Crawler.visit interface.
        """
        # Python 3 print (the file already uses print() with an f-string above).
        print('visited:', repr(link.url), 'from:', link.referrer)
        # Optionally: keep={'h1': [], 'h2': [], 'strong': [], 'a': ['href']}
        txt = plaintext(link.source)
        # Fix: the original called codecs.open() with no mode — the default
        # 'r' cannot .write() — and never closed the handle. Open for writing
        # with an explicit UTF-8 encoding and close via the context manager.
        out_path = save_folder + '/' + clean_title(link.url) + '.txt'
        with codecs.open(out_path, 'w', encoding='utf-8') as f:
            f.write(txt)

    def fail(self, link):
        """Log a link that could not be fetched."""
        print('failed:', repr(link.url))
|
||||
|
||||
def crawl():
    """Depth-first crawl of www.gavilan.edu, saving each page via GavCrawl.

    Polite crawl: 3-second delay between requests to the same domain and a
    3-second throttle per crawl step; the page cache is bypassed so fresh
    content is always fetched. Runs until the crawler's frontier is empty.
    """
    p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
    while not p.done:
        # DEPTH comes from pattern.web — it was referenced but never
        # imported in the original file.
        p.crawl(method=DEPTH, cached=False, throttle=3)
|
||||
|
||||
|
||||
|
||||
def samples():
    """Assorted pattern.web usage examples.

    Demonstrates: constructing a Crawler, checking a URL's MIME type,
    downloading a page and reducing it to plain text, resolving relative
    hrefs to absolute URLs via DOM, and extracting text from a PDF.
    All of these hit the network; this is demo/scratch code, not production.
    """
    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)

    # Fix: the original URL was missing the '/' before 'media', which
    # produced the bogus hostname 'www.clips.ua.ac.bemedia'.
    url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
    print(url.mimetype in MIMETYPE_IMAGE)

    # html = download('http://www.clips.ua.ac.be/', unicode=True)
    s = URL('http://www.clips.ua.ac.be').download()
    s = plaintext(s, keep={'h1': [], 'h2': [], 'strong': [], 'a': ['href']})

    # Getting absolute URLs from every anchor on a page.
    from pattern.web import URL, DOM, abs

    url = URL('http://www.clips.ua.ac.be')
    dom = DOM(url.download())
    for link in dom('a'):
        print(abs(link.attributes.get('href', ''), base=url.redirect or url.string))

    # Extracting text from a PDF.
    from pattern.web import URL, PDF

    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
    pdf = PDF(url.download())
    print(pdf.string)
|
||||
|
||||
## TODO find a package that extracts text from a web page
|
||||
|
||||
### TODO master list of what to index.
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue