content scraping

Peter Howell 2023-04-14 10:20:06 -07:00
parent 3f9fa2a8b5
commit ec8658cd8f
3 changed files with 2042 additions and 1637 deletions

apphelp.py (3596 lines changed)


@@ -6,6 +6,7 @@ from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
from collections import defaultdict
import tomd, checker
import html2markdown as h2m
import pypandoc
@@ -829,40 +830,83 @@ Schedule an In-Person, Phone or Zoom Appointment"""
## TODO site scraper
## TODO finde package that extracts text from web page
## TODO find package that extracts text from web page
### TODO master list of what to index.
from pattern.web import URL, plaintext, extension
## TODO PDFs and DOCXs
## TODO fix urls w/ anchors
from pattern.web import plaintext, extension
from pattern.web import download
from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler
from util import clean_title
#from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH
import bs4
import trafilatura
save_folder = 'cache/crawl'
clean_folder = 'cache/cleancrawl'
def clean_fn(s):
s = re.sub(r'[\s:]+','',s)
s = re.sub(r'\/','_',s)
return s
def format_html(html):
soup = bs4.BeautifulSoup(html, 'html.parser')
return soup.prettify()
class GavCrawl(Crawler):
def visit(self, link, source=None):
print 'visited:', repr(link.url), 'from:', link.referrer
txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
codecs.open(save_folder + '/' + clean_title(link.url) + '.txt').write(txt)
print('visited:', repr(link.url), 'from:', link.referrer)
#txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
#codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
def fail(self, link):
print 'failed:', repr(link.url)
print('failed:', repr(link.url))
def crawl():
p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
while not p.done:
p.crawl(method=DEPTH, cached=False, throttle=3)
try:
p.crawl(method=DEPTH, cached=False, throttle=0.76)
except Exception as e:
print("Exception: ", e)
def txt_clean_index():
files = os.listdir(save_folder)
line_freq = defaultdict(int)
# first pass
for f in files:
lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
for L in lines:
L = L.strip()
line_freq[L] += 1
# second pass
for f in files:
print("\n\n",f)
lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
out = codecs.open(clean_folder + '/' + f,'w','utf-8')
for L in lines:
L = L.strip()
if L in line_freq and line_freq[L] > 3:
continue
print(L)
out.write(L + '\n')
out.close()
def samples():
crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
print url.mimetype in MIMETYPE_IMAGE
print(url.mimetype in MIMETYPE_IMAGE)
#html = download('http://www.clips.ua.ac.be/', unicode=True)
@@ -876,14 +920,14 @@ def samples():
url = URL('http://www.clips.ua.ac.be')
dom = DOM(url.download())
for link in dom('a'):
print abs(link.attributes.get('href',''), base=url.redirect or url.string)
print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
# get pdfs
from pattern.web import URL, PDF
url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
pdf = PDF(url.download())
print pdf.string
print(pdf.string)
@@ -897,6 +941,8 @@ if __name__ == "__main__":
# 5: ['import freshdesk content', freshdesk ],
6: ['download all a courses pages', grab_course_pages],
7: ['demo vector search', demo_vector_search],
8: ['crawl',crawl],
9: ['clean text index', txt_clean_index],
}
for key in options:
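
As a side note (not part of this commit), a minimal sketch of how the trafilatura call added in GavCrawl.visit behaves on its own: both fetch_url and extract return None on failure, so the crawler's direct write of extract()'s result assumes the page yields recoverable content. The helper name and the __main__ usage below are illustrative; the seed URL matches the crawl above.

import trafilatura

def extract_page(url):
    # download the raw HTML; fetch_url returns None if the request fails
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        return None
    # same extraction options used in GavCrawl.visit; extract() returns
    # None when no main content can be recovered from the page
    return trafilatura.extract(downloaded, include_links=True,
                               deduplicate=True, include_images=True,
                               include_formatting=True)

if __name__ == "__main__":
    text = extract_page('http://www.gavilan.edu/')
    if text:
        print(text[:500])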


@@ -1086,6 +1086,7 @@ def add_evals(section=0):
s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
s = list(funcy.flatten(s))
s.sort()
print(s)
xyz = input('hit return to continue')
#c = getCoursesInTerm(168,0,1)
@@ -1306,7 +1307,6 @@ def set_ext_tools():
if __name__ == "__main__":
options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
30: ['List latestart classes', list_latestarts ],
2: ['Add announcements to homepage', change_course_ann_homepage],
3: ['Cross-list classes', xlist ],
4: ['List students who passed quiz X', get_quiz_passers],
@@ -1335,6 +1335,7 @@ if __name__ == "__main__":
27: ['Fine tune term dates and winter session', course_dates_terms],
28: ['Cross list a semester from file', semester_cross_lister],
29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
#30: ['List latestart classes', list_latestarts ],
# TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
#
}