content scraping

This commit is contained in:
Peter Howell 2023-04-14 10:20:06 -07:00
parent 3f9fa2a8b5
commit ec8658cd8f
3 changed files with 2042 additions and 1637 deletions

View File

@ -71,22 +71,21 @@ prog_write.close()
## Functions
#
# Total TODOs remaining: 57
# Total TODOs remaining: 67
#
# TODOs per file:
#
# 1 - checker.py
# 1 - content.py
# 6 - content.py
# 6 - courses.py
# 3 - curriculum.py
# 5 - depricated.py
# 6 - localcache.py
# 6 - depricated.py
# 7 - localcache.py
# 2 - outcomes.py
# 20 - pipelines.py
# 17 - pipelines.py
# 2 - server.py
# 2 - tasks.py
# 5 - tasks.py
# 1 - tempget.py
# 8 - users.py
# 12 - users.py
#
@ -95,8 +94,9 @@ prog_write.close()
__init__.py
canvas_secrets.py
checker.py
todo: make this sweet
def safe_html(html):
@ -120,12 +120,6 @@ content.py
def d(s):
def stripper(s):
def mycleaner(s):
def freshdesk():
# Build a master file with the entire class content
def accessible_check(id=""):
todo: include linked pages even if they aren't in module
@ -140,12 +134,6 @@ content.py
# DL pages only
def grab_course_pages(course_num=-1):
# Appears to not be used
def put_course_pages():
# Also not used
def put_revised_pages():
# Download, clean html, and reupload page
def update_page():
@ -163,13 +151,32 @@ content.py
def multiple_downloads():
def demo_vector_search():
def is_complete_sentence(text):
todo: site scraper
todo: find package that extracts text from web page
todo: master list of what to index.
todo: PDFs and DOCXs
todo: fix urls w/ anchors
def clean_fn(s):
def format_html(html):
def visit(self, link, source=None):
def fail(self, link):
def crawl():
def txt_clean_index():
def samples():
courses.py
todo:
def int_or_zero(x):
def float_or_zero(x):
# Gott 1 Bootcamp - report on who completed it.
def get_gott1_passers():
@ -179,15 +186,12 @@ courses.py
# Who, in a class, passed?
def get_course_passers(course, min_passing, passers_filename, still_active_filename):
# Who, in a class and a quiz, passed?
def get_quiz_passers():
# Change courses to show 2 announcements
def change_course_ann_homepage(id="10458"):
def scrape_bookstore():
todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts():
# All students enrolled in a class in the given semester. Simpler version of the one below. Returns a SET of course_ids.
def users_in_semester():
todo:
@ -203,13 +207,18 @@ courses.py
def getTerms(printme=1, ask=1):
todo: unsafe overwrite
def getCourses(): # a dict
def getCourses(x=0): # a dict
def update_course_conclude(courseid="13590",enddate='2021-12-23T01:00Z'):
# Relevant stuff; trying to see if it's even being used or not
def course_term_summary():
def course_term_summary_local(term="176",term_label="FA22"):
# Relevant stuff; trying to see if it's even being used or not
def course_term_summary(term="176",term_label="FA22"):
# Fetch all courses in a given term
def getCoursesInTerm(term=0,show=1,active=0): # a list
def getCoursesInTerm(term=0,get_fresh=1,show=0,active=0): # a list
def getCoursesTermSearch(term=0,search='',v=0):
@ -217,6 +226,16 @@ courses.py
def xlistLineSummary(c,sections={}):
def numbers_in_common(L):
def combined_name(nic,L):
def semester_cross_lister():
def xlist_ii(parasite_id,host_id,new_name,new_code):
def all_semester_course_sanity_check():
def eslCrosslister():
def xlist(parasite='', host=''): # section id , new course id
@ -226,21 +245,93 @@ courses.py
def enroll_stem_students_live():
def enroll_bulk_students_bydept(course_id, depts, the_term="172", cautious=1): # a string, a list of strings
todo: not done here
def enroll_art_students_live():
def enroll_orientation_students():
def summarize_proportion_online_classes(u):
def summarize_num_term_classes(u):
def enroll_o_s_students():
def make_ztc_list(sem='sp20'):
def course_search_by_sis():
def mod_eval_visibility( shell_id, visible=True ):
def instructor_list_to_activate_evals():
def add_evals(section=0):
def course_dates_terms(section=0):
def remove_n_analytics(section=0):
def create_sandboxes():
def course_term_summary_2():
def get_ext_tools():
def set_ext_tools():
todo: wanted: group shell for each GP (guided pathway) as a basic student services gateway....
cq_demo.py
def fetch(target):
curric2022.py
def fetch_all_programs():
def nothing(x=0):
def clean(st):
def recur_matcher(item, depth=0):
def single_course_parse(c):
def match_style_test():
def single_program_path_parse(c):
def path_style_prog():
def term_txt_to_code(t):
def all_outcomes():
def ddl():
def splitclassline(cl, id=''):
def path_style_2_html():
def course_path_style_2_html():
def another_request(url,startat):
def fetch_all_classes():
def recur_path_matcher(item, path=[]):
def x2_path_update(x,y,z):
def pathstyle(theclass):
def single_course_path_parse(c):
def path_style_test():
def make_sl():
def course_rank():
curriculum.py
todo: These secrets
def dbg(x):
def another_request(url,startat):
@ -335,6 +426,9 @@ curriculum.py
def is_online_inblock(c):
# 9/2021 clean programs to good json
def organize_programs_stage2():
# of all the programs, what can be accomplished online?
def find_online_programs():
@ -408,6 +502,35 @@ curriculum.py
def cq_8020_start():
def recurse3(sec,path=''):
def get_id_sortorder(sec):
def include_exclude(str,inc,exc=[]):
def pbd3(str):
def handleField(f):
def boolToStr(b):
# Almost final formatting
def prog_info_to_entry(c):
def cbd_to_entry(c):
def pc5(str):
def remove_prefix(str,i):
def course_to_entry(c,order="0"):
def courseline_to_pretty(line):
# restarted oct 2019 and try to simplify
def prog_take_4(program):
todo:
curriculum2020.py
def to_md(s):
@ -473,6 +596,18 @@ curriculum_patterns.py
def jj2(a,b,c,d):
depricated.py
todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts():
def prep_online_courses_df():
def course_is_online(crn):
def get_crn_from_name(name):
def get_enrlmts_for_user(user,enrollments):
# Don't know
def demo():
@ -504,10 +639,30 @@ depricated.py
def sd():
def serve():
def summarize_proportion_online_classes(u):
def summarize_num_term_classes(u):
todo: this duplicates courses.py ??
# Prompt for course id, return list of user dicts. TODO this duplicates courses.py ??
def getUsersInCourse(id=0): # returns list
def recur_look_for_leafs(item,indent=0,show=1):
def am_i_a_leaf(item):
def sampleclass():
def matchstyle():
def user_role_and_online():
def more_unused_xreferencing():
def users_p_file():
def com_channel_dim():
todo:
# NO LONGER USED - SEE COURSES
@ -559,6 +714,22 @@ depricated.py
def get_schedule(term='201870', sem='fall'):
def dates(s):
def parse_www_csv_sched():
def parse_json_test_sched():
def put_revised_pages():
def put_course_pages():
def freshdesk():
gpt.py
graphics.py
interactive.py
def dict_generator(indict, pre=None):
@ -579,8 +750,24 @@ interactive.py
def before_request():
def clears():
def dpi():
def dpi2():
def screenoff_a():
def light():
def do_image(filename):
def do_image_crop(filename,x,y,w,h,newname):
def save_post():
def writing_img(fname):
def restart():
def dispatch3(func,arg,arrg):
@ -597,12 +784,12 @@ interactive.py
def home():
def send_jslib(path):
def send_cachedata(path):
def send_js(path):
def send_jslib(path):
def s(key,val):
def do_sample():
@ -787,8 +974,6 @@ interactivex.py
def repl():
ipython_log.py
localcache.py
def db():
@ -893,7 +1078,12 @@ localcache.py
def f(x):
# get student count and teacher name from local db
def get_courses_in_term_local(term="172"):
# get student count
def course_student_stats(canvasid):
# get teacher name from local db
def course_quick_stats(canvasid):
# What a student has taken / teacher has taught
@ -932,24 +1122,37 @@ localcache.py
def qstrip(txt): return txt.strip('"')
def more_unused_xreferencing():
def user_role_and_online():
def comm_channel_file():
def pseudonym_file():
def users_p_file():
def com_channel_dim():
def abcd():
def crns_to_teachers():
def all_sem_courses_teachers():
def to_sis_sem(s):
def build_db_schedule():
def finder(st):
def process_enrollment_data():
def sem_to_idx(s):
todo:
def do_encoding():
main.py
myconsole.py
def handler(signum, frame):
def mainloop():
outcomes.py
def outcome_overview(term=21):
@ -958,12 +1161,10 @@ outcomes.py
def connect_acct_oc_to_course(course_id,oc_group_id):
def outcome_groups():
def outcome_groups_dump():
def outcome_groups_backup():
def x_ref_dept_names():
def create_course_group(short,parent):
def create_dept_group(short):
@ -992,6 +1193,74 @@ outcomes.py
def slo_source_by_dept():
def printj(j):
def writej(o,j):
# Get root outcome group
def root_og():
def recur_og():
def recur_main(out,g_url=""):
def recur2(out,og={}):
def all_og():
def course_slo_getter(q):
def threaded_getter():
def demo_o_fetch():
def outcome_groups_2021():
def x_ref_dept_names():
def all_outcome_results_in_term(termid=''):
def all_outcome_results_in_term_sub(termid=''):
def all_linked_outcomes_in_term(termid=''):
def all_linked_outcomes_in_term_sub(termid=''):
def assemblerow(g,parent=''):
def recur_full_fetch(out,g,parent=""):
# return the appropriate cq course version.
def find_cq_course_version(code):
def outcome_groups():
def summary_string(s):
def add_outcomes_course_id(canvas_id):
def add_outcomes_course_code():
def add_outcomes_course_code_sub(target_code='AJ184',term=178,fresh=0):
def add_csis_sp22():
def quick_add_course_outcomes(ilearn_course_id, cq_outcome_id_list):
def stringpad(s,n):
def code_from_ilearn_name(n,verbose=0):
def parse_ilearn_course_names_ALLSEMESTERS():
def parse_ilearn_course_names(term='178',fresh=1,log=0):
outcomes2022.py
def course_slo_getter(q):
def ilearn_shell_slo_to_csv(shell_slos):
patterns_8020.py
patterns_topdown.py
@ -1041,11 +1310,10 @@ patterns_topdown.py
def jj2(a,b,c,d):
pipelines.py
todo: secrets
todo: all these constants for SSB -- line 1008
todo: secrets
todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
def d(s):
def d(s,end=''):
# Main canvas querying fxn
def fetch(target,verbose=0):
@ -1073,12 +1341,6 @@ pipelines.py
def getSemesterSchedule(short='sp21'): # I used to be current_schedule
todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again
def prep_online_courses_df():
def course_is_online(crn):
def get_crn_from_name(name):
def get_enrlmts_for_user(user,enrollments):
# Get something from Canvas Data
@ -1110,6 +1372,7 @@ pipelines.py
### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
def course_start(course):
todo: use this to make an early/late/short field and store semester dates w/ other constants
todo: do these years matter?
def time_to_partofday(t):
todo: account for multiple sites/rows
@ -1132,9 +1395,10 @@ pipelines.py
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
def scrape_schedule():
todo: my data here.... secret
todo:
def dza_sched():
# recreate schedule json files with most current online schedule format.
def recent_schedules():
todo: sems is a global in this file. Is that the right thing to do?
@ -1149,7 +1413,6 @@ pipelines.py
# From instructure sftp site
def fetch_current_rosters():
todo: secret
def fetch_current_rosters_auto():
@ -1159,7 +1422,6 @@ pipelines.py
# Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1):
todo: remove this secret
todo: these paths
def sec(t): return "<h3>"+t+"</h3>\n"
@ -1205,8 +1467,43 @@ pipelines.py
def scrape_schedule_py():
def scrape_schedule_multi():
def scrape_for_db():
def argos_data():
def days_times(s):
def remove_year(s):
def argos_data_from_cvc():
def expand_old_semesters():
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term="su23"):
server.py
def mqtt_loop():
# called when MQTT server connects
def on_connect(client, userdata, flags, rc):
# The callback for when a PUBLISH message is received from the server.
def on_message(client, userdata, msg):
def displaypi_on():
def displaypi_off():
def desklight():
def clearscreens():
def screenoff():
def tag(x,y): return "<%s>%s</%s>" % (x,y,x)
def tagc(x,c,y): return '<%s class="%s">%s</%s>' % (x,c,y,x)
@ -1242,6 +1539,8 @@ server.py
def randPic():
def do_img_crop(im):
def sample():
def sample2(a=""):
@ -1286,6 +1585,16 @@ server.py
def staff_dir(search=''):
def find_goo(n):
def byname(x):
def fn_to_struct( n, staff ):
def image_edit(filename=''):
def image_crop(filename,x,y,w,h,newname=''):
def server_save(key,value):
def server_dispatch_json(function_name,arg='', arg2=''):
@ -1310,6 +1619,8 @@ stats.py
tasks.py
def scrape_bookstore():
def survey_answer(q=0):
def survey_organize():
@ -1360,6 +1671,36 @@ tasks.py
def pos_atten():
def lname(x):
def l_initial(x):
def job_titles2():
def job_titles():
# an early version, before tearing up...
def job_titles3():
def index_pics():
def cmtes():
def strip(x): return x.strip()
def esc_comma(x): return re.sub(',','[CMA]',x)
def by_sem(x): return x['sem']
def parse_schedule():
todo: check if i need to update it
todo: some weird hour offset issue w/ these activities
def cal():
todo: >
def file_renamer():
temp.py
tempget.py
@ -1466,6 +1807,7 @@ users.py
def teacher_basic_info(sched, from_ilearn, names):
def find_that_name(x):
todo: Old and broken
# Outputs: cache/teacher_by_semester.csv,
def teacherModalityHistory(sched=[],names=[]):
@ -1475,6 +1817,7 @@ users.py
# Outputs: cache/course_teacher_combos.csv,
def teacherSharedCourses(a=[]):
todo: this is broken
# How many courses in each department were taught in the last year?
def departmentCountCourses(a=[]):
@ -1512,6 +1855,7 @@ users.py
# Make one big csv file of everything I know about a teacher
def getTeachersInfoMain():
todo: - broken
def enroll_staff_shell():
@ -1521,8 +1865,8 @@ users.py
# Get view counts on current teachers. todo: month is hardcoded here
def get_recent_views(id=1):
todo: broken?
# Have they taught online or hybrid classes?
def categorize_user(u):
todo: threaded
@ -1539,8 +1883,6 @@ users.py
# Go through my local profile pics, upload any that are missing.
def uploadPhoto():
def test_email():
def create_ztc_list():
def get_user_info(id):
@ -1583,8 +1925,20 @@ users.py
def one_course_enrol():
def find_new_teachers():
def user_db_sync():
def find_no_goo():
def track_a_user():
util.py
def stripper(s):
def mycleaner(s):
def print_table(table):
def remove_nl(str):
@ -1597,6 +1951,10 @@ util.py
def clean_title(st):
def int_or_zero(x):
def float_or_zero(x):
def match59(x):
def item_2(x): return x[2]

View File (content.py)

@ -6,6 +6,7 @@ from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
from collections import defaultdict
import tomd, checker
import html2markdown as h2m
import pypandoc
@ -829,40 +830,83 @@ Schedule an In-Person, Phone or Zoom Appointment"""
## TODO site scraper
## TODO finde package that extracts text from web page
## TODO find package that extracts text from web page
### TODO master list of what to index.
from pattern.web import URL, plaintext, extension
## TODO PDFs and DOCXs
## TODO fix urls w/ anchors
from pattern.web import plaintext, extension
from pattern.web import download
from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler
from util import clean_title
#from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH
import bs4
import trafilatura
save_folder = 'cache/crawl'
clean_folder = 'cache/cleancrawl'
def clean_fn(s):
s = re.sub(r'[\s:]+','',s)
s = re.sub(r'\/','_',s)
return s
def format_html(html):
soup = bs4.BeautifulSoup(html, 'html.parser')
return soup.prettify()
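# GavCrawl subclasses pattern.web's Crawler: visit() fires once per fetched page
# and saves that page's extracted text to its own .txt file; fail() just logs.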
class GavCrawl(Crawler):
def visit(self, link, source=None):
print 'visited:', repr(link.url), 'from:', link.referrer
txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
codecs.open(save_folder + '/' + clean_title(link.url) + '.txt').write(txt)
print('visited:', repr(link.url), 'from:', link.referrer)
#txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
#codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
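# trafilatura extracts the main readable text (links, images, and basic
# formatting preserved), replacing the pattern.web plaintext approach above.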
codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
def fail(self, link):
print 'failed:', repr(link.url)
print('failed:', repr(link.url))
def crawl():
p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
while not p.done:
p.crawl(method=DEPTH, cached=False, throttle=3)
try:
p.crawl(method=DEPTH, cached=False, throttle=0.76)
except Exception as e:
print("Exception: ", e)
def txt_clean_index():
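# Two passes: count how often each stripped line occurs across every crawled
# file, then rewrite each file, dropping lines seen more than 3 times
# (site-wide nav/footer boilerplate repeats on nearly every page).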
files = os.listdir(save_folder)
line_freq = defaultdict(int)
# first pass
for f in files:
lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
for L in lines:
L = L.strip()
line_freq[L] += 1
# second pass
for f in files:
print("\n\n",f)
lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
out = codecs.open(clean_folder + '/' + f,'w','utf-8')
for L in lines:
L = L.strip()
if L in line_freq and line_freq[L] > 3:
continue
print(L)
out.write(L + '\n')
out.close()
def samples():
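# Assorted snippets adapted from the pattern.web docs; FIFO, DOM, and abs()
# (pattern's URL resolver, not the builtin) are pattern.web exports that
# don't appear in the imports shown above.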
crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
print url.mimetype in MIMETYPE_IMAGE
print(url.mimetype in MIMETYPE_IMAGE)
#html = download('http://www.clips.ua.ac.be/', unicode=True)
@ -876,14 +920,14 @@ def samples():
url = URL('http://www.clips.ua.ac.be')
dom = DOM(url.download())
for link in dom('a'):
print abs(link.attributes.get('href',''), base=url.redirect or url.string)
print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
# get pdfs
from pattern.web import URL, PDF
url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
pdf = PDF(url.download())
print pdf.string
print(pdf.string)
@ -897,6 +941,8 @@ if __name__ == "__main__":
# 5: ['import freshdesk content', freshdesk ],
6: ['download all a courses pages', grab_course_pages],
7: ['demo vector search', demo_vector_search],
8: ['crawl',crawl],
9: ['clean text index', txt_clean_index],
}
for key in options:

View File (courses.py)

@ -1086,6 +1086,7 @@ def add_evals(section=0):
s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
s = list(funcy.flatten(s))
s.sort()
print(s)
xyz = input('hit return to continue')
#c = getCoursesInTerm(168,0,1)
@ -1306,7 +1307,6 @@ def set_ext_tools():
if __name__ == "__main__":
options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
30: ['List latestart classes', list_latestarts ],
2: ['Add announcements to homepage', change_course_ann_homepage],
3: ['Cross-list classes', xlist ],
4: ['List students who passed quiz X', get_quiz_passers],
@ -1335,6 +1335,7 @@ if __name__ == "__main__":
27: ['Fine tune term dates and winter session', course_dates_terms],
28: ['Cross list a semester from file', semester_cross_lister],
29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
#30: ['List latestart classes', list_latestarts ],
# TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
#
}