commit 6807ddd96c
phowell 2023-04-17 16:30:17 -07:00
7 changed files with 2400 additions and 1704 deletions


@@ -71,22 +71,21 @@ prog_write.close()
## Functions
#
-# Total TODOs remaining: 57
+# Total TODOs remaining: 67
#
# TODOs per file:
#
-# 1 - checker.py
-# 1 - content.py
+# 6 - content.py
# 6 - courses.py
# 3 - curriculum.py
-# 5 - depricated.py
+# 6 - depricated.py
-# 6 - localcache.py
+# 7 - localcache.py
# 2 - outcomes.py
-# 20 - pipelines.py
+# 17 - pipelines.py
# 2 - server.py
-# 2 - tasks.py
+# 5 - tasks.py
# 1 - tempget.py
-# 8 - users.py
+# 12 - users.py
#
@@ -95,8 +94,9 @@ prog_write.close()
__init__.py __init__.py
canvas_secrets.py
checker.py checker.py
todo: make this sweet
def safe_html(html): def safe_html(html):
@@ -120,12 +120,6 @@ content.py
def d(s): def d(s):
def stripper(s):
def mycleaner(s):
def freshdesk():
# Build a master file with the entire class content # Build a master file with the entire class content
def accessible_check(id=""): def accessible_check(id=""):
todo: include linked pages even if they aren't in module todo: include linked pages even if they aren't in module
@@ -140,12 +134,6 @@ content.py
# DL pages only # DL pages only
def grab_course_pages(course_num=-1): def grab_course_pages(course_num=-1):
# Appears to not be used
def put_course_pages():
# Also not used
def put_revised_pages():
# Download, clean html, and reupload page # Download, clean html, and reupload page
def update_page(): def update_page():
@@ -163,13 +151,32 @@ content.py
def multiple_downloads(): def multiple_downloads():
def demo_vector_search():
def is_complete_sentence(text):
todo: site scraper
todo: find package that extracts text from web page
todo: master list of what to index.
todo: PDFs and DOCXs
todo: fix urls w/ anchors
def clean_fn(s):
def format_html(html):
def visit(self, link, source=None):
def fail(self, link):
def crawl():
def txt_clean_index():
def samples():
courses.py courses.py
todo: todo:
def int_or_zero(x):
def float_or_zero(x):
# Gott 1 Bootcamp - report on who completed it. # Gott 1 Bootcamp - report on who completed it.
def get_gott1_passers(): def get_gott1_passers():
@@ -179,15 +186,12 @@ courses.py
# Who, in a class, passed? # Who, in a class, passed?
def get_course_passers(course, min_passing, passers_filename, still_active_filename): def get_course_passers(course, min_passing, passers_filename, still_active_filename):
# Who, in a class and a quiz, passed?
def get_quiz_passers():
# Change courses to show 2 announcements # Change courses to show 2 announcements
def change_course_ann_homepage(id="10458"): def change_course_ann_homepage(id="10458"):
def scrape_bookstore():
todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts():
# All students enrolled in a class in the given semester. Simpler version of below. Return SET of course_ids.
def users_in_semester(): def users_in_semester():
todo: todo:
@@ -203,13 +207,18 @@ courses.py
def getTerms(printme=1, ask=1): def getTerms(printme=1, ask=1):
todo: unsafe overwrite todo: unsafe overwrite
-def getCourses(): # a dict
+def getCourses(x=0): # a dict
def update_course_conclude(courseid="13590",enddate='2021-12-23T01:00Z'):
# Relevant stuff; trying to see if it's even being used or not
-def course_term_summary():
+def course_term_summary_local(term="176",term_label="FA22"):
+# Relevant stuff; trying to see if it's even being used or not
+def course_term_summary(term="176",term_label="FA22"):
# Fetch all courses in a given term # Fetch all courses in a given term
-def getCoursesInTerm(term=0,show=1,active=0): # a list
+def getCoursesInTerm(term=0,get_fresh=1,show=0,active=0): # a list
def getCoursesTermSearch(term=0,search='',v=0): def getCoursesTermSearch(term=0,search='',v=0):
@@ -217,6 +226,16 @@ courses.py
def xlistLineSummary(c,sections={}): def xlistLineSummary(c,sections={}):
def numbers_in_common(L):
def combined_name(nic,L):
def semester_cross_lister():
def xlist_ii(parasite_id,host_id,new_name,new_code):
def all_semester_course_sanity_check():
def eslCrosslister(): def eslCrosslister():
def xlist(parasite='', host=''): # section id , new course id def xlist(parasite='', host=''): # section id , new course id
@@ -226,21 +245,93 @@ courses.py
def enroll_stem_students_live(): def enroll_stem_students_live():
def enroll_bulk_students_bydept(course_id, depts, the_term="172", cautious=1): # a string, a list of strings
todo: not done here
def enroll_art_students_live():
def enroll_orientation_students(): def enroll_orientation_students():
-def summarize_proportion_online_classes(u):
-def summarize_num_term_classes(u):
+def enroll_o_s_students():
def make_ztc_list(sem='sp20'): def make_ztc_list(sem='sp20'):
def course_search_by_sis(): def course_search_by_sis():
def mod_eval_visibility( shell_id, visible=True ):
def instructor_list_to_activate_evals():
def add_evals(section=0): def add_evals(section=0):
def course_dates_terms(section=0):
def remove_n_analytics(section=0):
def create_sandboxes():
def course_term_summary_2():
def get_ext_tools():
def set_ext_tools():
todo: wanted: group shell for each GP (guided pathway) as a basic student services gateway.... todo: wanted: group shell for each GP (guided pathway) as a basic student services gateway....
cq_demo.py
def fetch(target):
curric2022.py
def fetch_all_programs():
def nothing(x=0):
def clean(st):
def recur_matcher(item, depth=0):
def single_course_parse(c):
def match_style_test():
def single_program_path_parse(c):
def path_style_prog():
def term_txt_to_code(t):
def all_outcomes():
def ddl():
def splitclassline(cl, id=''):
def path_style_2_html():
def course_path_style_2_html():
def another_request(url,startat):
def fetch_all_classes():
def recur_path_matcher(item, path=[]):
def x2_path_update(x,y,z):
def pathstyle(theclass):
def single_course_path_parse(c):
def path_style_test():
def make_sl():
def course_rank():
curriculum.py curriculum.py
todo: These secrets
def dbg(x):
def another_request(url,startat): def another_request(url,startat):
@@ -335,6 +426,9 @@ curriculum.py
def is_online_inblock(c): def is_online_inblock(c):
# 9/2021 clean programs to good json
def organize_programs_stage2():
# of all the programs, what can be accomplished online? # of all the programs, what can be accomplished online?
def find_online_programs(): def find_online_programs():
@@ -408,6 +502,35 @@ curriculum.py
def cq_8020_start(): def cq_8020_start():
def recurse3(sec,path=''):
def get_id_sortorder(sec):
def include_exclude(str,inc,exc=[]):
def pbd3(str):
def handleField(f):
def boolToStr(b):
# Almost final formatting
def prog_info_to_entry(c):
def cbd_to_entry(c):
def pc5(str):
def remove_prefix(str,i):
def course_to_entry(c,order="0"):
def courseline_to_pretty(line):
# restarted oct 2019 and try to simplify
def prog_take_4(program):
todo:
curriculum2020.py curriculum2020.py
def to_md(s): def to_md(s):
@@ -473,6 +596,18 @@ curriculum_patterns.py
def jj2(a,b,c,d): def jj2(a,b,c,d):
depricated.py depricated.py
todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts():
def prep_online_courses_df():
def course_is_online(crn):
def get_crn_from_name(name):
def get_enrlmts_for_user(user,enrollments):
# Don't know # Don't know
def demo(): def demo():
@@ -504,10 +639,30 @@ depricated.py
def sd(): def sd():
def serve(): def serve():
def summarize_proportion_online_classes(u):
def summarize_num_term_classes(u):
todo: this duplicates courses.py ?? todo: this duplicates courses.py ??
# Prompt for course id, return list of user dicts. TODO this duplicates courses.py ?? # Prompt for course id, return list of user dicts. TODO this duplicates courses.py ??
def getUsersInCourse(id=0): # returns list def getUsersInCourse(id=0): # returns list
def recur_look_for_leafs(item,indent=0,show=1):
def am_i_a_leaf(item):
def sampleclass():
def matchstyle():
def user_role_and_online():
def more_unused_xreferencing():
def users_p_file():
def com_channel_dim():
todo: todo:
# NO LONGER USED - SEE COURSES # NO LONGER USED - SEE COURSES
@@ -559,6 +714,22 @@ depricated.py
def get_schedule(term='201870', sem='fall'): def get_schedule(term='201870', sem='fall'):
def dates(s):
def parse_www_csv_sched():
def parse_json_test_sched():
def put_revised_pages():
def put_course_pages():
def freshdesk():
gpt.py
graphics.py
interactive.py interactive.py
def dict_generator(indict, pre=None): def dict_generator(indict, pre=None):
@@ -579,8 +750,24 @@ interactive.py
def before_request(): def before_request():
def clears():
def dpi():
def dpi2():
def screenoff_a():
def light():
def do_image(filename):
def do_image_crop(filename,x,y,w,h,newname):
def save_post(): def save_post():
def writing_img(fname):
def restart(): def restart():
def dispatch3(func,arg,arrg): def dispatch3(func,arg,arrg):
@@ -597,12 +784,12 @@ interactive.py
def home(): def home():
def send_jslib(path):
def send_cachedata(path): def send_cachedata(path):
def send_js(path): def send_js(path):
def send_jslib(path):
def s(key,val): def s(key,val):
def do_sample(): def do_sample():
@@ -787,8 +974,6 @@ interactivex.py
def repl(): def repl():
ipython_log.py
localcache.py localcache.py
def db(): def db():
@@ -893,7 +1078,12 @@ localcache.py
def f(x): def f(x):
-# get student count and teacher name from local db
+def get_courses_in_term_local(term="172"):
+# get student count
+def course_student_stats(canvasid):
+# get teacher name from local db
def course_quick_stats(canvasid):
# What a student has taken / teacher has taught # What a student has taken / teacher has taught
@@ -932,24 +1122,37 @@ localcache.py
def qstrip(txt): return txt.strip('"') def qstrip(txt): return txt.strip('"')
def more_unused_xreferencing():
def user_role_and_online():
def comm_channel_file(): def comm_channel_file():
def pseudonym_file(): def pseudonym_file():
def users_p_file():
def com_channel_dim():
def abcd(): def abcd():
def crns_to_teachers(): def crns_to_teachers():
def all_sem_courses_teachers():
def to_sis_sem(s):
def build_db_schedule():
def finder(st):
def process_enrollment_data():
def sem_to_idx(s):
todo:
def do_encoding():
main.py main.py
myconsole.py
def handler(signum, frame):
def mainloop():
outcomes.py outcomes.py
def outcome_overview(term=21): def outcome_overview(term=21):
@@ -958,12 +1161,10 @@ outcomes.py
def connect_acct_oc_to_course(course_id,oc_group_id): def connect_acct_oc_to_course(course_id,oc_group_id):
-def outcome_groups():
+def outcome_groups_dump():
def outcome_groups_backup(): def outcome_groups_backup():
def x_ref_dept_names():
def create_course_group(short,parent): def create_course_group(short,parent):
def create_dept_group(short): def create_dept_group(short):
@@ -992,6 +1193,74 @@ outcomes.py
def slo_source_by_dept(): def slo_source_by_dept():
def printj(j):
def writej(o,j):
# Get root outcome group
def root_og():
def recur_og():
def recur_main(out,g_url=""):
def recur2(out,og={}):
def all_og():
def course_slo_getter(q):
def threaded_getter():
def demo_o_fetch():
def outcome_groups_2021():
def x_ref_dept_names():
def all_outcome_results_in_term(termid=''):
def all_outcome_results_in_term_sub(termid=''):
def all_linked_outcomes_in_term(termid=''):
def all_linked_outcomes_in_term_sub(termid=''):
def assemblerow(g,parent=''):
def recur_full_fetch(out,g,parent=""):
# return the appropriate cq course version.
def find_cq_course_version(code):
def outcome_groups():
def summary_string(s):
def add_outcomes_course_id(canvas_id):
def add_outcomes_course_code():
def add_outcomes_course_code_sub(target_code='AJ184',term=178,fresh=0):
def add_csis_sp22():
def quick_add_course_outcomes(ilearn_course_id, cq_outcome_id_list):
def stringpad(s,n):
def code_from_ilearn_name(n,verbose=0):
def parse_ilearn_course_names_ALLSEMESTERS():
def parse_ilearn_course_names(term='178',fresh=1,log=0):
outcomes2022.py
def course_slo_getter(q):
def ilearn_shell_slo_to_csv(shell_slos):
patterns_8020.py patterns_8020.py
patterns_topdown.py patterns_topdown.py
@@ -1041,11 +1310,10 @@ patterns_topdown.py
def jj2(a,b,c,d): def jj2(a,b,c,d):
pipelines.py
-todo: secrets
todo: all these constants for SSB -- line 1008
-todo: secrets
+todo: https://stackoverflow.com/questions/42656247/how-can-i-use-canvas-data-rest-api-using-python
-def d(s):
+def d(s,end=''):
# Main canvas querying fxn # Main canvas querying fxn
def fetch(target,verbose=0): def fetch(target,verbose=0):
@@ -1073,12 +1341,6 @@ pipelines.py
def getSemesterSchedule(short='sp21'): # I used to be current_schedule def getSemesterSchedule(short='sp21'): # I used to be current_schedule
todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again todo: Some semesters have a different format.... partofday type site xxx i just dL'd them again
def prep_online_courses_df():
def course_is_online(crn):
def get_crn_from_name(name):
def get_enrlmts_for_user(user,enrollments): def get_enrlmts_for_user(user,enrollments):
# Get something from Canvas Data # Get something from Canvas Data
@@ -1110,6 +1372,7 @@ pipelines.py
### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section ### course is a list of 1-3 lists, each one being a line in the schedule's output. First one has section
def course_start(course): def course_start(course):
todo: use this to make a early/late/short field and store semester dates w/ other constants todo: use this to make a early/late/short field and store semester dates w/ other constants
todo: do these years matter?
def time_to_partofday(t): def time_to_partofday(t):
todo: account for multiple sites/rows todo: account for multiple sites/rows
@@ -1132,9 +1395,10 @@ pipelines.py
# Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed # Use Firefox and log in to ssb and get full schedule. Only works where selenium is installed
def scrape_schedule(): def scrape_schedule():
todo: my data here.... secret
todo: todo:
def dza_sched():
# recreate schedule json files with most current online schedule format. # recreate schedule json files with most current online schedule format.
def recent_schedules(): def recent_schedules():
todo: sems is a global in this file. Is that the right thing to do? todo: sems is a global in this file. Is that the right thing to do?
@@ -1149,7 +1413,6 @@ pipelines.py
# From instructure sftp site # From instructure sftp site
def fetch_current_rosters(): def fetch_current_rosters():
todo: secret
def fetch_current_rosters_auto(): def fetch_current_rosters_auto():
@@ -1159,7 +1422,6 @@ pipelines.py
# Upload a json file to www # Upload a json file to www
def put_file(remotepath,localpath, localfile,prompt=1): def put_file(remotepath,localpath, localfile,prompt=1):
todo: remove this secret
todo: these paths todo: these paths
def sec(t): return "<h3>"+t+"</h3>\n" def sec(t): return "<h3>"+t+"</h3>\n"
@@ -1205,8 +1467,43 @@ pipelines.py
def scrape_schedule_py(): def scrape_schedule_py():
def scrape_schedule_multi():
def scrape_for_db():
def argos_data():
def days_times(s):
def remove_year(s):
def argos_data_from_cvc():
def expand_old_semesters():
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts(term="su23"):
server.py server.py
def mqtt_loop():
# called when MQTT server connects
def on_connect(client, userdata, flags, rc):
# The callback for when a PUBLISH message is received from the server.
def on_message(client, userdata, msg):
def displaypi_on():
def displaypi_off():
def desklight():
def clearscreens():
def screenoff():
def tag(x,y): return "<%s>%s</%s>" % (x,y,x) def tag(x,y): return "<%s>%s</%s>" % (x,y,x)
def tagc(x,c,y): return '<%s class="%s">%s</%s>' % (x,c,y,x) def tagc(x,c,y): return '<%s class="%s">%s</%s>' % (x,c,y,x)
@@ -1242,6 +1539,8 @@ server.py
def randPic(): def randPic():
def do_img_crop(im):
def sample(): def sample():
def sample2(a=""): def sample2(a=""):
@@ -1286,6 +1585,16 @@ server.py
def staff_dir(search=''): def staff_dir(search=''):
def find_goo(n):
def byname(x):
def fn_to_struct( n, staff ):
def image_edit(filename=''):
def image_crop(filename,x,y,w,h,newname=''):
def server_save(key,value): def server_save(key,value):
def server_dispatch_json(function_name,arg='', arg2=''): def server_dispatch_json(function_name,arg='', arg2=''):
@@ -1310,6 +1619,8 @@ stats.py
tasks.py tasks.py
def scrape_bookstore():
def survey_answer(q=0): def survey_answer(q=0):
def survey_organize(): def survey_organize():
@@ -1360,6 +1671,36 @@ tasks.py
def pos_atten(): def pos_atten():
def lname(x):
def l_initial(x):
def job_titles2():
def job_titles():
# an early version, before tearing up...
def job_titles3():
def index_pics():
def cmtes():
def strip(x): return x.strip()
def esc_comma(x): return re.sub(',','[CMA]',x)
def by_sem(x): return x['sem']
def parse_schedule():
todo: check if i need to update it
todo: some weird hour offset issue w/ these activities
def cal():
todo: >
def file_renamer():
temp.py temp.py
tempget.py tempget.py
@@ -1466,6 +1807,7 @@ users.py
def teacher_basic_info(sched, from_ilearn, names): def teacher_basic_info(sched, from_ilearn, names):
def find_that_name(x): def find_that_name(x):
todo: Old and broken
# Outputs: cache/teacher_by_semester.csv, # Outputs: cache/teacher_by_semester.csv,
def teacherModalityHistory(sched=[],names=[]): def teacherModalityHistory(sched=[],names=[]):
@@ -1475,6 +1817,7 @@ users.py
# Outputs: cache/course_teacher_combos.csv, # Outputs: cache/course_teacher_combos.csv,
def teacherSharedCourses(a=[]): def teacherSharedCourses(a=[]):
todo: this is broken
# How many courses in each department were taught in the last year? # How many courses in each department were taught in the last year?
def departmentCountCourses(a=[]): def departmentCountCourses(a=[]):
@@ -1512,6 +1855,7 @@ users.py
# Make one big csv file of everything I know about a teacher # Make one big csv file of everything I know about a teacher
def getTeachersInfoMain(): def getTeachersInfoMain():
todo: - broken
def enroll_staff_shell(): def enroll_staff_shell():
@@ -1521,8 +1865,8 @@ users.py
# Get views counts on current teachers. todo: month is hardcoded here # Get views counts on current teachers. todo: month is hardcoded here
def get_recent_views(id=1): def get_recent_views(id=1):
todo: broken?
# Have they taught online or hybrid classes?
def categorize_user(u): def categorize_user(u):
todo: threaded todo: threaded
@@ -1539,8 +1883,6 @@ users.py
# Go through my local profile pics, upload any that are missing. # Go through my local profile pics, upload any that are missing.
def uploadPhoto(): def uploadPhoto():
def test_email():
def create_ztc_list(): def create_ztc_list():
def get_user_info(id): def get_user_info(id):
@@ -1583,8 +1925,20 @@ users.py
def one_course_enrol(): def one_course_enrol():
def find_new_teachers():
def user_db_sync():
def find_no_goo():
def track_a_user():
util.py util.py
def stripper(s):
def mycleaner(s):
def print_table(table): def print_table(table):
def remove_nl(str): def remove_nl(str):
@@ -1597,6 +1951,10 @@ util.py
def clean_title(st): def clean_title(st):
def int_or_zero(x):
def float_or_zero(x):
def match59(x): def match59(x):
def item_2(x): return x[2] def item_2(x): return x[2]

content.py

@@ -1,17 +1,18 @@
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
-import requests, codecs, os, re, json
+import requests, codecs, os, re, json, sys, pypandoc
+import webbrowser, bs4, trafilatura, pickle, tomd, checker
+import html2markdown as h2m
from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
-import tomd, checker
+from collections import defaultdict
-import html2markdown as h2m
+from pdfminer.high_level import extract_text
-import pypandoc
+from sentence_transformers import SentenceTransformer, util
-import webbrowser

h = HTMLParser()

DBG = 1
@@ -21,8 +21,6 @@ def d(s):
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def accessible_check(id=""):
@@ -828,64 +827,266 @@ Schedule an In-Person, Phone or Zoom Appointment"""
    print(f"Vector for the word '{example_word}': {vector}")

+def makedir():
+    files = os.listdir('cache/crawl')
+    #print(files)
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            print(parts)
+
+def manual_index():
+    files = os.listdir('cache/crawl')
+    #print(files)
+    ii = codecs.open('cache/crawl/index.html','w','utf-8')
+    ii.write('<html><body><h1>Site index</h1>\n')
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            ii.write('<br /><a href="mirror/'+f+'">'+f+'</a>\n')
+    ii.write('</body></html>\n')  # close the page and the file so /mirror can serve it
+    ii.close()
+
+def my_site():
+    files = os.listdir('cache/crawl')
+    output = []
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            name = m.groups()[0]
+            parts = name.split('+')
+            output.append(parts)
+    return output
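The three helpers above all recover path segments from crawl-cache filenames using the same regex that clean_fn() (defined further down in this diff) produces. For illustration, a round trip on a hypothetical cached filename (not a file from the repo):

import re
# hypothetical filename, shaped like clean_fn(url) + '.txt' as written by the crawler
f = 'https++www.gavilan.edu+admissions+apply.html.txt'
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$', f)
if m:
    print(m.groups()[0].split('+'))   # -> ['admissions', 'apply']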
## TODO site scraper
-## TODO finde package that extracts text from web page
+## TODO find package that extracts text from web page
### TODO master list of what to index.
+## TODO PDFs and DOCXs
+## TODO fix urls w/ anchors
-from pattern.web import URL, plaintext, extension
-from pattern.web import download
-from pattern import URL, MIMETYPE_IMAGE
-from pattern.web import Crawler
-from util import clean_title
-
-save_folder = 'cache/crawl'
-
-class GavCrawl(Crawler):
-    def visit(self, link, source=None):
-        print 'visited:', repr(link.url), 'from:', link.referrer
-        txt = plaintext(link.source) ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
-        codecs.open(save_folder + '/' + clean_title(link.url) + '.txt').write(txt)
-    def fail(self, link):
-        print 'failed:', repr(link.url)
def crawl():
-    p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
-    while not p.done:
-        p.crawl(method=DEPTH, cached=False, throttle=3)
+    import scrapy, logging
+    from scrapy.crawler import CrawlerProcess
+
+    logger = logging.getLogger()
+    logger.setLevel(level=logging.CRITICAL)
+    logging.basicConfig(level=logging.CRITICAL)
+    logger.disabled = True
+
+    avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft',
+             'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ]
+
+    class MySpider(scrapy.Spider):
+        name = 'myspider'
+        #start_urls = ['https://gavilan.curriqunet.com/catalog/iq/1826']
+        start_urls = ['https://www.gavilan.edu']
+        """
+        logging.getLogger("scrapy").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.utils.log").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.extensions.telnet").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.middleware").setLevel(logging.CRITICAL)
+        logging.getLogger("scrapy.core.engine").setLevel(logging.CRITICAL)
+        logger.disabled = True"""
+
+        def parse(self, response):
+            print('visited:', repr(response.url), 'status:', response.status)
+            # PDFs: download the file, then extract its text with pdfminer
+            if re.search(r'\.pdf$', response.url):
+                m = re.search(r'\/([^\/]+\.pdf)$', response.url)
+                if m:
+                    print("saving to ", save_folder + '/' + clean_fn(response.url))
+                    pdf_response = requests.get(response.url)
+                    with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+                        f.write(pdf_response.content)
+                    text = extract_text(save_folder + '/' + clean_fn(response.url))
+                    codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text)
+            # Office documents: download, convert with pandoc, then save the page text
+            for ext in ['doc','docx','ppt','pptx']:
+                if re.search(r'\.'+ext+'$', response.url):
+                    m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
+                    if m:
+                        print("saving to ", save_folder + '/' + clean_fn(response.url))
+                        pdf_response = requests.get(response.url)
+                        with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+                            f.write(pdf_response.content)
+                        #text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt')
+                        output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % hash ])
+                        txt_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
+                        if txt_output:
+                            codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output)
+            # Images: just save the binary
+            for ext in ['jpg','jpeg','gif','webp']:
+                if re.search(r'\.'+ext+'$', response.url):
+                    m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
+                    if m:
+                        print("saving to ", save_folder + '/' + clean_fn(response.url))
+                        pdf_response = requests.get(response.url)
+                        with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
+                            f.write(pdf_response.content)
+            # Everything else: save the page text extracted by trafilatura
+            f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8')
+            this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
+            if this_output:
+                f_out.write(this_output)
+            f_out.close()
+
+            links = response.css('a::attr(href)').getall()
+            # Follow each link and parse its contents
+            for link in links:
+                go = 1
+                full_link = response.urljoin(link)
+                print('++++++ trying ', full_link)
+                if not re.search(r'gavilan\.edu',full_link):
+                    go = 0
+                    print('--- not gav edu')
+                else:
+                    if re.search(r'hhh\.gavilan\.edu',full_link):
+                        pass
+                    elif not re.search(r'^https?:\/\/www\.gavilan\.edu',full_link):
+                        # need to add www to gavilan.edu
+                        m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$',full_link)
+                        if m:
+                            full_link = m.group(1) + 'www.gavilan.edu' + m.group(2)
+                    for a in avoid:
+                        if re.search(a,full_link):
+                            go = 0
+                            print('--- avoid ', a)
+                if go:
+                    yield scrapy.Request(full_link, callback=self.parse,
+                        headers={"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"})
+                else:
+                    print("------ avoiding ", full_link)
+
+    # Instantiate a CrawlerProcess object
+    process = CrawlerProcess()
+    # Add the MySpider spider to the process
+    process.crawl(MySpider)
+    # Start the process
+    logging.basicConfig(level=logging.CRITICAL)
+    logging.getLogger('scrapy').propagate = False
+    logging.getLogger("trafilatura").setLevel(logging.CRITICAL)
+    logging.getLogger("trafilatura").propagate = False
+    logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+    logging.getLogger("pdfminer").propagate = False
+    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
+    logging.getLogger("urllib3").propagate = False
+    logging.basicConfig(level=logging.CRITICAL)
+    process.start()
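The www-normalization in parse() rewrites bare gavilan.edu links before the avoid-list filter runs. A quick standalone check of that rewrite, using illustrative URLs:

import re
for full_link in ['https://gavilan.edu/finaid/', 'http://gavilan.edu/library/index.php']:
    m = re.search(r'^(https?:\/\/)gavilan\.edu(\/.*)$', full_link)
    if m:
        print(m.group(1) + 'www.gavilan.edu' + m.group(2))
# https://www.gavilan.edu/finaid/
# http://www.gavilan.edu/library/index.php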
-def samples():
-    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
-    url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
-    print url.mimetype in MIMETYPE_IMAGE
-    #html = download('http://www.clips.ua.ac.be/', unicode=True)
-    s = URL('http://www.clips.ua.ac.be').download()
-    s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
-    # getting absolute urls
-    from pattern.web import URL, DOM, abs
-    url = URL('http://www.clips.ua.ac.be')
-    dom = DOM(url.download())
-    for link in dom('a'):
-        print abs(link.attributes.get('href',''), base=url.redirect or url.string)
-    # get pdfs
-    from pattern.web import URL, PDF
-    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
-    pdf = PDF(url.download())
-    print pdf.string
+save_folder = 'cache/crawl'
+clean_folder = 'cache/cleancrawl'
+
+def clean_fn(s):
+    s = re.sub(r'[\s:]+','',s)
+    s = re.sub(r'\/','+',s)
+    return s
+
+def format_html(html):
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    return soup.prettify()
+
+def txt_clean_index():
+    files = os.listdir(save_folder)
+    line_freq = defaultdict(int)
+    # first pass
+    for f in files:
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        for L in lines:
+            L = L.strip()
+            line_freq[L] += 1
+    # second pass
+    for f in files:
+        print("\n\n",f)
+        lines = codecs.open(save_folder + '/' + f,'r','utf-8').readlines()
+        out = codecs.open(clean_folder + '/' + f,'w','utf-8')
+        for L in lines:
+            L = L.strip()
+            if L in line_freq and line_freq[L] > 3:
+                continue
+            print(L)
+            out.write(L + '\n')
+        out.close()
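txt_clean_index() is a two-pass boilerplate filter: the first pass counts how often each stripped line occurs across all crawled pages, the second drops any line seen more than three times (navigation bars, footers, and other shared chrome). A toy illustration of the heuristic with made-up page content:

from collections import defaultdict
pages = {'a.txt': ['Home', 'About', 'Apply now'],
         'b.txt': ['Home', 'About', 'Financial aid'],
         'c.txt': ['Home', 'About', 'Library hours'],
         'd.txt': ['Home', 'About', 'Welcome to Gavilan']}
line_freq = defaultdict(int)
for lines in pages.values():          # first pass: count repeats
    for L in lines:
        line_freq[L] += 1
for f, lines in pages.items():        # second pass: keep the rare (real) content
    print(f, [L for L in lines if line_freq[L] <= 3])
# a.txt ['Apply now'] ... the shared 'Home'/'About' chrome is dropped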
+def search_embeddings():
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    save_embeds = pickle.load( open( "cache/embeddings.p", "rb" ) )
+    columns = list(zip(*save_embeds))
+    files = columns[0]
+    sentences = columns[1]
+    embeddings = columns[2]
+    print(files[:20])
+    print(sentences[:20])
+    print(embeddings[:20])
+    s = ''
+    while s != 'q':
+        s = input("search or 'q' to quit: ")
+        if s == 'q':
+            return
+        query_embedding = model.encode(s)
+        # Compute the cosine similarity between the query embedding and the sentence embeddings
+        # (cos_sim returns a 1 x N tensor, so take row 0 to get one score per sentence)
+        cosine_scores = util.cos_sim(query_embedding, embeddings)[0]
+        # Sort the sentences by their cosine similarity to the query sentence
+        results = sorted(zip(sentences, cosine_scores, files), key=lambda x: x[1], reverse=True)
+        # Print the top 5 results
+        for i, (sentence, score, file) in enumerate(results[:5]):
+            print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')
+
+def create_embeddings():
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    files = os.listdir('cache/crawl')
+    output = []
+    save_embeds = [] # ['file','sentence','embedding']
+    files.sort()
+    for f in files:
+        m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
+        if m:
+            lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines()
+            lines = [L.strip() for L in lines]
+            lines = [L for L in lines if L]
+            embeddings = model.encode(lines)
+            print("\n-----", f)
+            #Print the embeddings
+            for sentence, embedding in zip(lines, embeddings):
+                print("Sentence:", sentence)
+                #print("Embedding:", embedding)
+                save_embeds.append([f,sentence,embedding])
+    pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) )
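The search side loads the pickled (file, sentence, embedding) triples and ranks every stored sentence against the query by cosine similarity. A minimal self-contained sketch of the same ranking, assuming sentence-transformers is installed; the sentences and query here are illustrative:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['Financial aid deadlines for fall', 'Library hours and study rooms',
             'How to apply for admission']
embeddings = model.encode(sentences)
query = model.encode('when is the FAFSA due?')
scores = util.cos_sim(query, embeddings)[0]        # shape (3,) after taking row 0
for sent, score in sorted(zip(sentences, scores), key=lambda x: float(x[1]), reverse=True):
    print(f'{score.item():.3f}  {sent}')           # best match should be the financial aid line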
if __name__ == "__main__": if __name__ == "__main__":
@@ -897,8 +1098,19 @@ if __name__ == "__main__":
    # 5: ['import freshdesk content', freshdesk ],
    6: ['download all a courses pages', grab_course_pages],
    7: ['demo vector search', demo_vector_search],
+    8: ['crawl',crawl],
+    9: ['clean text index', txt_clean_index],
+    10: ['make web dir struct', manual_index],
+    11: ['create search embeddings', create_embeddings],
+    12: ['do a search', search_embeddings],
    }
+    if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
+        resp = int(sys.argv[1])
+        print("\n\nPerforming: %s\n\n" % options[resp][0])
+    else:
+        print ('')
    for key in options:
        print(str(key) + '.\t' + options[key][0])
@@ -908,5 +1120,3 @@ if __name__ == "__main__":
    # Call the function in the options dict
    options[ int(resp)][1]()

courses.py

@@ -1086,6 +1086,7 @@ def add_evals(section=0):
    s = [ x.strip() for x in codecs.open('cache/sp23_eval_sections.csv','r').readlines()]
    s = list(funcy.flatten(s))
    s.sort()
+    print(s)
    xyz = input('hit return to continue')
    #c = getCoursesInTerm(168,0,1)
@@ -1306,7 +1307,6 @@ def set_ext_tools():
if __name__ == "__main__":
    options = { 1: ['Cross check schedule with ztc responses',make_ztc_list] ,
-        30: ['List latestart classes', list_latestarts ],
        2: ['Add announcements to homepage', change_course_ann_homepage],
        3: ['Cross-list classes', xlist ],
        4: ['List students who passed quiz X', get_quiz_passers],
@@ -1335,6 +1335,7 @@ if __name__ == "__main__":
        27: ['Fine tune term dates and winter session', course_dates_terms],
        28: ['Cross list a semester from file', semester_cross_lister],
        29: ['Check all courses & their sections in semester', all_semester_course_sanity_check],
+        #30: ['List latestart classes', list_latestarts ],
        # TODO wanted: group shell for each GP (guided pathway) as a basic student services gateway....
        #
        }

depricated.py

@@ -1807,3 +1807,95 @@ def freshdesk():

+#### content.py
+from pattern.web import plaintext, extension
+from pattern.web import download
+#from pattern import URL, MIMETYPE_IMAGE
+from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF
+
+class GavCrawl(Crawler):
+    def visit(self, link, source=None):
+        print('visited:', repr(link.url), 'from:', link.referrer)
+        print(' ', link.url.mimetype)
+        #txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
+        #codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
+        codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
+
+    def fail(self, link):
+        print('failed:', repr(link.url))
+        if re.search(r'\.pdf$', link.url):
+            m = re.search(r'\/([^\/]+\.pdf)$', link.url)
+            if m:
+                save_file = m.group(1)
+                print("saving to ", save_folder + '/' + save_file)
+                pdf_response = requests.get(link.url)
+                with open(save_folder + '/' + save_file, 'wb') as f:
+                    f.write(pdf_response.content)
+                text = extract_text(save_folder + '/' + save_file)
+                #print(text)
+                codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text)
+            else:
+                print("no match for pdf url: ", link.url)
+        for ext in ['jpg','jpeg','gif','webp']:
+            if re.search(r'\.'+ext+'$', link.url):
+                m = re.search(r'\/([^\/]+\.'+ext+')$', link.url)
+                if m:
+                    save_file = m.group(1)
+                    print("saving to ", save_folder + '/' + save_file)
+                    pdf_response = requests.get(link.url)
+                    with open(save_folder + '/' + save_file, 'wb') as f:
+                        f.write(pdf_response.content)
+                else:
+                    print('no match for '+ext+' url: ', link.url)
+
+def crawl2():
+    #p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+    #p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+    p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
+    while not p.done:
+        try:
+            p.crawl(method=DEPTH, cached=False, throttle=0.76)
+        except Exception as e:
+            print("Exception: ", e)
+
+def samples():
+    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
+    url = URL('http://www.clips.ua.ac.bemedia/pattern_schema.gif')
+    print(url.mimetype in MIMETYPE_IMAGE)
+    #html = download('http://www.clips.ua.ac.be/', unicode=True)
+    s = URL('http://www.clips.ua.ac.be').download()
+    s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
+    # getting absolute urls
+    from pattern.web import URL, DOM, abs
+    url = URL('http://www.clips.ua.ac.be')
+    dom = DOM(url.download())
+    for link in dom('a'):
+        print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
+    # get pdfs
+    from pattern.web import URL, PDF
+    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
+    pdf = PDF(url.download())
+    print(pdf.string)

gpt.py

@@ -4,8 +4,8 @@ import openai
from canvas_secrets import openai_org, openai_api_key
-openai.organization = "org-[redacted]"
+openai.organization = openai_org
-openai.api_key = "sk-[redacted]"
+openai.api_key = openai_api_key
#print(openai.Model.list())
my_prompt = "Write a series of texts trying to sell a pen to a stranger."
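The hard-coded credentials move into canvas_secrets.py, which this commit adds (it shows up in the function index above). Its contents aren't part of the diff, but judging from the imports here and in interactive.py it is presumably a flat module of constants, something like this hypothetical sketch:

# hypothetical shape of canvas_secrets.py -- the real file is (rightly) not shown
openai_org = "org-..."        # OpenAI organization id
openai_api_key = "sk-..."     # OpenAI API key
flask_secretkey = "..."       # Flask session signing key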

interactive.py

@@ -1,4 +1,3 @@
-import curses
import heapq, re, csv, os, shutil, datetime, urllib
import itertools, time, markdown, csv, json, os.path, webbrowser, threading
from functools import wraps
@@ -15,6 +14,20 @@ import localcache
from server import *
from canvas_secrets import flask_secretkey
+from content import my_site
+
+import socket
+this_host = socket.gethostname()
+print('\n\n' + this_host, '\n\n')
+
+has_curses = 0
+if this_host != 'ROGDESKTOP':
+    import curses
+    has_curses = 1
+else:
+    print("Skipping curses stuff")

q = Queue()
@@ -25,7 +38,6 @@ PORT_NUMBER = 8080 # Maybe set this to 9000.
datafile = 'lambda.csv'
-#writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'

####
@@ -95,6 +107,15 @@ def flask_thread(q):
+    @app.route('/mirror')
+    def mirror():
+        return codecs.open('cache/crawl/index.html','r','utf-8').read()
+
+    @app.route('/mirror/<filename>')
+    def mirror_file(filename):
+        # render the cached text as markdown, then append the raw text below it
+        return markdown.markdown( codecs.open('cache/crawl/'+filename,'r','utf-8').read() ) + \
+            "<pre>" + codecs.open('cache/crawl/'+filename,'r','utf-8').read() + "</pre>"

    @app.route('/clearscreens')
    def clears():
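The new /mirror routes serve the crawl cache through the existing Flask app: the generated index page as-is, and individual cached files both markdown-rendered and raw. A quick way to eyeball them once the app is running; the port and filename here are illustrative assumptions:

import requests
base = 'http://localhost:5000'   # assumes the Flask default port
print(requests.get(base + '/mirror').text[:300])
print(requests.get(base + '/mirror/https++www.gavilan.edu+admissions.html.txt').text[:300])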
@@ -166,6 +187,7 @@ def flask_thread(q):
    @app.route('/x/writing/images/<fname>')
    def writing_img(fname):
+        # TODO
        img_path = "/media/hd2/peter_home/Documents/writing_img/"
        print(img_path + fname + " - writing images folder")
        img_ext = fname.split('.')[-1]

server.py

@@ -1,5 +1,5 @@
import json, codecs, re, markdown, os, pypandoc, striprtf, sqlite3, random, urllib
-import subprocess, html
+import subprocess, html, time
from striprtf.striprtf import rtf_to_text
from flask import render_template, Response
from flask import send_from_directory
@@ -16,8 +16,33 @@ from localcache import arrange_data_for_web, depts_with_classcounts, dept_with_s
from yattag import Doc

+import socket
+this_host = socket.gethostname()
+print('\n\n server host: ' + this_host, '\n\n')
+
LECPATH = "/media/hd2/peter_home_offload/lecture/"
host = 'http://192.168.1.6:5000'
+news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
+writing_path = '/media/hd2/peter_home/Documents/writing/'
+img_path = '/media/hd2/peter_home/Documents/writing_img/'
+pics_path = '/media/hd2/peter_home/misc/'
+
+if this_host == 'ROGDESKTOP':
+    LECPATH = "d:/peter_home_offload/lecture/"
+    host = 'http://192.168.1.7:5000'
+    news_path = 'd:/peter_home/Documents/scripts/browser/'
+    writing_path = 'd:/peter_home/Documents/writing/'
+    img_path = 'd:/peter_home/Documents/writing_img/'
+    pics_path = 'd:/peter_home/misc/'

import paho.mqtt.client as mqtt
@@ -55,7 +80,7 @@ def on_message(client, userdata, msg):
    print(" %s mqtt msg: %s data: %s" % (now, msg.topic, msg.payload.decode()))

+if 0:  # disable the MQTT connect loop
    while(mqtt_offline):
        try:
            client = mqtt.Client()
@@ -114,18 +139,6 @@ def screenoff():

######

-news_path = '/media/hd2/peter_home/Documents/scripts/browser/'
-if platform.system() == 'Windows':
-    writing_path = 'c:/users/peter/Nextcloud/Documents/writing/'
-else:
-    writing_path = '/media/hd2/peter_home/Documents/writing/'
-img_path = '/media/hd2/peter_home/Documents/writing_img/'
-if platform.system() == 'Windows':
-    pics_path = 'c:/users/peter/Nextcloud/misc/'
-else:
-    pics_path = '/media/hd2/peter_home/misc/'
br = "<br />"
nl = "\n"