content scraping

Peter Howell 2023-04-14 10:20:06 -07:00
parent 3f9fa2a8b5
commit ec8658cd8f
3 changed files with 2042 additions and 1637 deletions


@@ -71,22 +71,21 @@ prog_write.close()
 ## Functions
 #
-# Total TODOs remaining: 57
+# Total TODOs remaining: 67
 #
 # TODOs per file:
 #
-# 1 - checker.py
-# 1 - content.py
+# 6 - content.py
 # 6 - courses.py
 # 3 - curriculum.py
-# 5 - depricated.py
-# 6 - localcache.py
+# 6 - depricated.py
+# 7 - localcache.py
 # 2 - outcomes.py
-# 20 - pipelines.py
+# 17 - pipelines.py
 # 2 - server.py
-# 2 - tasks.py
+# 5 - tasks.py
 # 1 - tempget.py
-# 8 - users.py
+# 12 - users.py
 #
@@ -95,8 +94,9 @@ prog_write.close()
 __init__.py
+canvas_secrets.py
 checker.py
+todo: make this sweet
 def safe_html(html):
@@ -120,12 +120,6 @@ content.py
 def d(s):
-def stripper(s):
-def mycleaner(s):
-def freshdesk():
 # Build a master file with the entire class content
 def accessible_check(id=""):
 todo: include linked pages even if they aren't in module
@@ -140,12 +134,6 @@ content.py
 # DL pages only
 def grab_course_pages(course_num=-1):
-# Appears to not be used
-def put_course_pages():
-# Also not used
-def put_revised_pages():
 # Download, clean html, and reupload page
 def update_page():
@@ -163,13 +151,32 @@ content.py
 def multiple_downloads():
+def demo_vector_search():
+def is_complete_sentence(text):
+todo: site scraper
+todo: find package that extracts text from web page
+todo: master list of what to index.
+todo: PDFs and DOCXs
+todo: fix urls w/ anchors
+def clean_fn(s):
+def format_html(html):
+def visit(self, link, source=None):
+def fail(self, link):
+def crawl():
+def txt_clean_index():
+def samples():
 courses.py
 todo:
-def int_or_zero(x):
-def float_or_zero(x):
 # Gott 1 Bootcamp - report on who completed it.
 def get_gott1_passers():
@@ -179,15 +186,12 @@ courses.py
 # Who, in a class, passed?
 def get_course_passers(course, min_passing, passers_filename, still_active_filename):
-# Who, in a class and a quiz, passed?
-def get_quiz_passers():
 # Change courses to show 2 announcements
 def change_course_ann_homepage(id="10458"):
-def scrape_bookstore():
-todo: where does the most recent schedule come from?
-# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
-def list_latestarts():
 # All students enrolled in a class in the given semester. Simpler version of below. Return SET of course_ids.
 def users_in_semester():
 todo:
@@ -203,13 +207,18 @@ courses.py
 def getTerms(printme=1, ask=1):
 todo: unsafe overwrite
-def getCourses(): # a dict
+def getCourses(x=0): # a dict
+def update_course_conclude(courseid="13590",enddate='2021-12-23T01:00Z'):
 # Relevant stuff trying to see if its even being used or not
-def course_term_summary():
+def course_term_summary_local(term="176",term_label="FA22"):
+# Relevant stuff trying to see if its even being used or not
+def course_term_summary(term="176",term_label="FA22"):
 # Fetch all courses in a given term
-def getCoursesInTerm(term=0,show=1,active=0): # a list
+def getCoursesInTerm(term=0,get_fresh=1,show=0,active=0): # a list
 def getCoursesTermSearch(term=0,search='',v=0):
@@ -217,6 +226,16 @@ courses.py
 def xlistLineSummary(c,sections={}):
+def numbers_in_common(L):
+def combined_name(nic,L):
+def semester_cross_lister():
+def xlist_ii(parasite_id,host_id,new_name,new_code):
+def all_semester_course_sanity_check():
 def eslCrosslister():
 def xlist(parasite='', host=''): # section id , new course id
@@ -226,21 +245,93 @@ courses.py
 def enroll_stem_students_live():
+def enroll_bulk_students_bydept(course_id, depts, the_term="172", cautious=1): # a string, a list of strings
+todo: not done here
+def enroll_art_students_live():
 def enroll_orientation_students():
-def summarize_proportion_online_classes(u):
-def summarize_num_term_classes(u):
+def enroll_o_s_students():
 def make_ztc_list(sem='sp20'):
 def course_search_by_sis():
+def mod_eval_visibility( shell_id, visible=True ):
+def instructor_list_to_activate_evals():
 def add_evals(section=0):
+def course_dates_terms(section=0):
+def remove_n_analytics(section=0):
+def create_sandboxes():
+def course_term_summary_2():
+def get_ext_tools():
+def set_ext_tools():
 todo: wanted: group shell for each GP (guided pathway) as a basic student services gateway....
+cq_demo.py
+def fetch(target):
+curric2022.py
+def fetch_all_programs():
+def nothing(x=0):
+def clean(st):
+def recur_matcher(item, depth=0):
+def single_course_parse(c):
+def match_style_test():
+def single_program_path_parse(c):
+def path_style_prog():
+def term_txt_to_code(t):
+def all_outcomes():
+def ddl():
+def splitclassline(cl, id=''):
+def path_style_2_html():
+def course_path_style_2_html():
+def another_request(url,startat):
+def fetch_all_classes():
+def recur_path_matcher(item, path=[]):
+def x2_path_update(x,y,z):
+def pathstyle(theclass):
+def single_course_path_parse(c):
+def path_style_test():
+def make_sl():
+def course_rank():
 curriculum.py
-todo: These secrets
+def dbg(x):
 def another_request(url,startat):
@@ -335,6 +426,9 @@ curriculum.py
 def is_online_inblock(c):
+# 9/2021 clean programs to good json
+def organize_programs_stage2():
 # of all the programs, what can be accomplished online?
 def find_online_programs():
@@ -408,6 +502,35 @@ curriculum.py
 def cq_8020_start():
+def recurse3(sec,path=''):
+def get_id_sortorder(sec):
+def include_exclude(str,inc,exc=[]):
+def pbd3(str):
+def handleField(f):
+def boolToStr(b):
+# Almost final formatting
+def prog_info_to_entry(c):
+def cbd_to_entry(c):
+def pc5(str):
+def remove_prefix(str,i):
+def course_to_entry(c,order="0"):
+def courseline_to_pretty(line):
+# restarted oct 2019 and try to simplify
+def prog_take_4(program):
+todo:
 curriculum2020.py
 def to_md(s):
@@ -473,6 +596,18 @@ curriculum_patterns.py
 def jj2(a,b,c,d):
 depricated.py
+todo: where does the most recent schedule come from?
+# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
+def list_latestarts():
+def prep_online_courses_df():
+def course_is_online(crn):
+def get_crn_from_name(name):
+def get_enrlmts_for_user(user,enrollments):
 # Don't know
 def demo():
@@ -504,10 +639,30 @@ depricated.py
 def sd():
 def serve():
+def summarize_proportion_online_classes(u):
+def summarize_num_term_classes(u):
 todo: this duplicates courses.py ??
 # Prompt for course id, return list of user dicts. TODO this duplicates courses.py ??
 def getUsersInCourse(id=0): # returns list
+def recur_look_for_leafs(item,indent=0,show=1):
+def am_i_a_leaf(item):
+def sampleclass():
+def matchstyle():
+def user_role_and_online():
+def more_unused_xreferencing():
+def users_p_file():
+def com_channel_dim():
 todo:
 # NO LONGER USED - SEE COURSES
@@ -559,6 +714,22 @@ depricated.py
 def get_schedule(term='201870', sem='fall'):
+def dates(s):
+def parse_www_csv_sched():
+def parse_json_test_sched():
+def put_revised_pages():
+def put_course_pages():
+def freshdesk():
+gpt.py
+graphics.py
 interactive.py
 def dict_generator(indict, pre=None):
@@ -579,8 +750,24 @@ interactive.py
 def before_request():
+def clears():
+def dpi():
+def dpi2():
+def screenoff_a():
+def light():
+def do_image(filename):
+def do_image_crop(filename,x,y,w,h,newname):
 def save_post():
+def writing_img(fname):
 def restart():
 def dispatch3(func,arg,arrg):
@@ -597,12 +784,12 @@ interactive.py
 def home():
-def send_jslib(path):
 def send_cachedata(path):
 def send_js(path):
+def send_jslib(path):
 def s(key,val):
 def do_sample():
@@ -787,8 +974,6 @@ interactivex.py
 def repl():
-ipython_log.py
 localcache.py
 def db():
@@ -893,7 +1078,12 @@ localcache.py
 def f(x):
-# get student count and teacher name from local db
+def get_courses_in_term_local(term="172"):
+# get student count
+def course_student_stats(canvasid):
+# get teacher name from local db
 def course_quick_stats(canvasid):
 # What a student has taken / teacher has taught
@@ -932,24 +1122,37 @@ localcache.py
 def qstrip(txt): return txt.strip('"')
-def more_unused_xreferencing():
-def user_role_and_online():
 def comm_channel_file():
 def pseudonym_file():
-def users_p_file():
-def com_channel_dim():
 def abcd():
 def crns_to_teachers():
+def all_sem_courses_teachers():
+def to_sis_sem(s):
+def build_db_schedule():
+def finder(st):
+def process_enrollment_data():
+def sem_to_idx(s):
+todo:
+def do_encoding():
 main.py
+myconsole.py
+def handler(signum, frame):
+def mainloop():
 outcomes.py
 def outcome_overview(term=21):
@@ -958,12 +1161,10 @@ outcomes.py
 def connect_acct_oc_to_course(course_id,oc_group_id):
-def outcome_groups():
+def outcome_groups_dump():
 def outcome_groups_backup():
-def x_ref_dept_names():
 def create_course_group(short,parent):
 def create_dept_group(short):
@@ -992,6 +1193,74 @@ outcomes.py
 def slo_source_by_dept():
+def printj(j):
+def writej(o,j):
+# Get root outcome group
+def root_og():
+def recur_og():
+def recur_main(out,g_url=""):
+def recur2(out,og={}):
+def all_og():
+def course_slo_getter(q):
+def threaded_getter():
+def demo_o_fetch():
+def outcome_groups_2021():
+def x_ref_dept_names():
+def all_outcome_results_in_term(termid=''):
+def all_outcome_results_in_term_sub(termid=''):
+def all_linked_outcomes_in_term(termid=''):
+def all_linked_outcomes_in_term_sub(termid=''):
+def assemblerow(g,parent=''):
+def recur_full_fetch(out,g,parent=""):
+# return the appropriate cq course version.
+def find_cq_course_version(code):
+def outcome_groups():
+def summary_string(s):
+def add_outcomes_course_id(canvas_id):
+def add_outcomes_course_code():
+def add_outcomes_course_code_sub(target_code='AJ184',term=178,fresh=0):
+def add_csis_sp22():
+def quick_add_course_outcomes(ilearn_course_id, cq_outcome_id_list):
+def stringpad(s,n):
+def code_from_ilearn_name(n,verbose=0):
+def parse_ilearn_course_names_ALLSEMESTERS():
+def parse_ilearn_course_names(term='178',fresh=1,log=0):
+outcomes2022.py
+def course_slo_getter(q):
+def ilearn_shell_slo_to_csv(shell_slos):
 patterns_8020.py
 patterns_topdown.py
@@ -1041,11 +1310,10 @@ patterns_topdown.py
 def jj2(a,b,c,d):
 pipelines.py
-todo: secrets
 todo: all these constants for SSB -- line 1008
-todo: secrets
+todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
-def d(s):
+def d(s,end=''):
 # Main canvas querying fxn
 def fetch(target,verbose=0):
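(Aside: fetch(target) above is the main Canvas query helper, and the new todo links a Stack Overflow thread on calling the Canvas REST API from Python. A minimal sketch of what such a helper typically looks like, assuming a requests-based loop with token auth and Link-header pagination; BASE and TOKEN are assumed names, with the real values presumably in the new canvas_secrets.py:)

import requests

BASE = "https://example.instructure.com/api/v1"  # assumed; the real host lives elsewhere
TOKEN = "..."  # assumed; a Canvas API access token

def fetch(target, verbose=0):
    # Walk Canvas' Link-header pagination until there is no 'next' page.
    url = BASE + target
    out = []
    while url:
        r = requests.get(url, headers={"Authorization": "Bearer " + TOKEN})
        r.raise_for_status()
        page = r.json()
        out.extend(page if isinstance(page, list) else [page])
        if verbose:
            print(url, len(out))
        url = r.links.get("next", {}).get("url")  # requests parses the Link header
    return out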
@@ -1073,12 +1341,6 @@ pipelines.py
 def getSemesterSchedule(short='sp21'): # I used to be current_schedule
 todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again
-def prep_online_courses_df():
-def course_is_online(crn):
-def get_crn_from_name(name):
 def get_enrlmts_for_user(user,enrollments):
 # Get something from Canvas Data
@@ -1110,6 +1372,7 @@ pipelines.py
 ### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
 def course_start(course):
 todo: use this to make a early/late/short field and store semester dates w/ other constants
+todo: do these years matter?
 def time_to_partofday(t):
 todo: account for multiple sites/rows
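(Aside: a minimal sketch of the bucketing that time_to_partofday presumably performs; the cut points and input format here are assumptions, not the file's actual values:)

def time_to_partofday(t):
    # t like "0930" or "1805" from a schedule row (format assumed)
    hour = int(t[:2])
    if hour < 12:
        return "morning"
    if hour < 17:
        return "afternoon"
    return "evening"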
@@ -1132,9 +1395,10 @@ pipelines.py
 # Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
 def scrape_schedule():
-todo: my data here.... secret
 todo:
+def dza_sched():
 # recreate schedule json files with most current online schedule format.
 def recent_schedules():
 todo: sems is a global in this file. Is that the right thing to do?
@@ -1149,7 +1413,6 @@ pipelines.py
 # From instructure sftp site
 def fetch_current_rosters():
-todo: secret
 def fetch_current_rosters_auto():
@@ -1159,7 +1422,6 @@ pipelines.py
 # Upload a json file to www
 def put_file(remotepath,localpath, localfile,prompt=1):
-todo: remove this secret
 todo: these paths
 def sec(t): return "<h3>"+t+"</h3>\n"
@@ -1205,8 +1467,43 @@ pipelines.py
 def scrape_schedule_py():
+def scrape_schedule_multi():
+def scrape_for_db():
+def argos_data():
+def days_times(s):
+def remove_year(s):
+def argos_data_from_cvc():
+def expand_old_semesters():
+# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
+def list_latestarts(term="su23"):
 server.py
+def mqtt_loop():
+# called when MQTT server connects
+def on_connect(client, userdata, flags, rc):
+# The callback for when a PUBLISH message is received from the server.
+def on_message(client, userdata, msg):
+def displaypi_on():
+def displaypi_off():
+def desklight():
+def clearscreens():
+def screenoff():
 def tag(x,y): return "<%s>%s</%s>" % (x,y,x)
 def tagc(x,c,y): return '<%s class="%s">%s</%s>' % (x,c,y,x)
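(Aside: tag and tagc are shown in full above and behave like this:)

tag('h1', 'Hello')            # -> '<h1>Hello</h1>'
tagc('div', 'card', 'body')   # -> '<div class="card">body</div>'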
@@ -1242,6 +1539,8 @@ server.py
 def randPic():
+def do_img_crop(im):
 def sample():
 def sample2(a=""):
@@ -1286,6 +1585,16 @@ server.py
 def staff_dir(search=''):
+def find_goo(n):
+def byname(x):
+def fn_to_struct( n, staff ):
+def image_edit(filename=''):
+def image_crop(filename,x,y,w,h,newname=''):
 def server_save(key,value):
 def server_dispatch_json(function_name,arg='', arg2=''):
@@ -1310,6 +1619,8 @@ stats.py
 tasks.py
+def scrape_bookstore():
 def survey_answer(q=0):
 def survey_organize():
@@ -1360,6 +1671,36 @@ tasks.py
 def pos_atten():
+def lname(x):
+def l_initial(x):
+def job_titles2():
+def job_titles():
+# an early version, before tearing up...
+def job_titles3():
+def index_pics():
+def cmtes():
+def strip(x): return x.strip()
+def esc_comma(x): return re.sub(',','[CMA]',x)
+def by_sem(x): return x['sem']
+def parse_schedule():
+todo: check if i need to update it
+todo: some weird hour offset issue w/ these activities
+def cal():
+todo: >
+def file_renamer():
 temp.py
 tempget.py
@@ -1466,6 +1807,7 @@ users.py
 def teacher_basic_info(sched, from_ilearn, names):
 def find_that_name(x):
+todo: Old and broken
 # Outputs: cache/teacher_by_semester.csv,
 def teacherModalityHistory(sched=[],names=[]):
@@ -1475,6 +1817,7 @@ users.py
 # Outputs: cache/course_teacher_combos.csv,
 def teacherSharedCourses(a=[]):
+todo: this is broken
 # How many courses in each department were taught in the last year?
 def departmentCountCourses(a=[]):
@@ -1512,6 +1855,7 @@ users.py
 # Make one big csv file of everything I know about a teacher
 def getTeachersInfoMain():
+todo: - broken
 def enroll_staff_shell():
@@ -1521,8 +1865,8 @@ users.py
 # Get views counts on current teachers. todo: month is hardcoded here
 def get_recent_views(id=1):
-# Have they taught online or hybrid classes?
+todo: broken?
 def categorize_user(u):
 todo: threaded
@@ -1539,8 +1883,6 @@ users.py
 # Go through my local profile pics, upload any that are missing.
 def uploadPhoto():
-def test_email():
 def create_ztc_list():
 def get_user_info(id):
@@ -1583,8 +1925,20 @@ users.py
 def one_course_enrol():
+def find_new_teachers():
+def user_db_sync():
+def find_no_goo():
+def track_a_user():
 util.py
+def stripper(s):
+def mycleaner(s):
 def print_table(table):
 def remove_nl(str):
@@ -1597,6 +1951,10 @@ util.py
 def clean_title(st):
+def int_or_zero(x):
+def float_or_zero(x):
 def match59(x):
 def item_2(x): return x[2]

content.py

@@ -6,6 +6,7 @@ from pipelines import header, fetch, url, put_file
 from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
 from bs4 import BeautifulSoup as bs
 from html.parser import HTMLParser
+from collections import defaultdict
 import tomd, checker
 import html2markdown as h2m
 import pypandoc
@ -829,40 +830,83 @@ Schedule an In-Person, Phone or Zoom Appointment"""
## TODO site scraper ## TODO site scraper
## TODO find package that extracts text from web page
## TODO finde package that extracts text from web page
### TODO master list of what to index. ### TODO master list of what to index.
from pattern.web import URL, plaintext, extension ## TODO PDFs and DOCXs
## TODO fix urls w/ anchors
from pattern.web import plaintext, extension
from pattern.web import download from pattern.web import download
from pattern import URL, MIMETYPE_IMAGE #from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler from pattern.web import Crawler, DEPTH
from util import clean_title import bs4
import trafilatura
save_folder = 'cache/crawl' save_folder = 'cache/crawl'
clean_folder = 'cache/cleancrawl'
def clean_fn(s):
s = re.sub(r'[\s:]+','',s)
s = re.sub(r'\/','_',s)
return s
def format_html(html):
soup = bs4.BeautifulSoup(html, 'html.parser')
return soup.prettify()
class GavCrawl(Crawler): class GavCrawl(Crawler):
def visit(self, link, source=None): def visit(self, link, source=None):
print 'visited:', repr(link.url), 'from:', link.referrer print('visited:', repr(link.url), 'from:', link.referrer)
txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']}) #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
codecs.open(save_folder + '/' + clean_title(link.url) + '.txt').write(txt) #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
def fail(self, link): def fail(self, link):
print 'failed:', repr(link.url) print('failed:', repr(link.url))
def crawl(): def crawl():
p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3) p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu'], delay=0.75)
while not p.done: while not p.done:
p.crawl(method=DEPTH, cached=False, throttle=3) try:
p.crawl(method=DEPTH, cached=False, throttle=0.76)
except Exception as e:
print("Exception: ", e)
+def txt_clean_index():
+    files = os.listdir(save_folder)
+    line_freq = defaultdict(int)
+    # first pass
+    for f in files:
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        for L in lines:
+            L = L.strip()
+            line_freq[L] += 1
+    # second pass
+    for f in files:
+        print("\n\n",f)
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        out = codecs.open(clean_folder + '/' + f,'w','utf-8')
+        for L in lines:
+            L = L.strip()
+            if L in line_freq and line_freq[L] > 3:
+                continue
+            print(L)
+            out.write(L + '\n')
+        out.close()
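(The "> 3" cutoff is the key design choice in txt_clean_index: any line that recurs more than three times across the crawled pages is treated as navigation, header, or footer boilerplate and dropped, so only page-specific text reaches cache/cleancrawl. The threshold is site-specific and worth revisiting if short legitimate lines repeat across many pages.)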
 def samples():
     crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
     url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
-    print url.mimetype in MIMETYPE_IMAGE
+    print(url.mimetype in MIMETYPE_IMAGE)
     #html = download('http://www.clips.ua.ac.be/', unicode=True)
@@ -876,14 +920,14 @@ def samples():
     url = URL('http://www.clips.ua.ac.be')
     dom = DOM(url.download())
     for link in dom('a'):
-        print abs(link.attributes.get('href',''), base=url.redirect or url.string)
+        print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
     # get pdfs
     from pattern.web import URL, PDF
     url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
     pdf = PDF(url.download())
-    print pdf.string
+    print(pdf.string)
@ -897,6 +941,8 @@ if __name__ == "__main__":
# 5: ['import freshdesk content', freshdesk ], # 5: ['import freshdesk content', freshdesk ],
6: ['download all a courses pages', grab_course_pages], 6: ['download all a courses pages', grab_course_pages],
7: ['demo vector search', demo_vector_search], 7: ['demo vector search', demo_vector_search],
8: ['crawl',crawl],
9: ['clean text index', txt_clean_index],
} }
for key in options: for key in options:
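(Aside: these __main__ menus all share the same shape, a dict of number -> [label, function]. A minimal sketch of the dispatch the truncated hunk implies; the prompt wording is an assumption:)

for key in options:
    print(key, '-', options[key][0])
choice = int(input('which? '))
options[choice][1]()  # call the selected function, no arguments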

courses.py

@@ -1086,6 +1086,7 @@ def add_evals(section=0):
     s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
     s = list(funcy.flatten(s))
     s.sort()
+    print(s)
     xyz = input('hit return to continue')
     #c = getCoursesInTerm(168,0,1)
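(Aside: funcy.flatten here is defensive. It dives into nested containers but treats strings as atoms, so a flat list of section-id strings passes through unchanged while accidental nesting is collapsed. For example:)

from funcy import flatten
list(flatten(['10001', ['10002', '10003']]))  # -> ['10001', '10002', '10003']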
@@ -1306,7 +1307,6 @@ def set_ext_tools():
 if __name__ == "__main__":
     options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
-    30: ['List latestart classes', list_latestarts ],
     2: ['Add announcements to homepage', change_course_ann_homepage],
     3: ['Cross-list classes', xlist ],
     4: ['List students who passed quiz X', get_quiz_passers],
@@ -1335,6 +1335,7 @@ if __name__ == "__main__":
     27: ['Fine tune term dates and winter session', course_dates_terms],
     28: ['Cross list a semester from file', semester_cross_lister],
     29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
+    #30: ['List latestart classes', list_latestarts ],
     # TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
     #
     }