From 0e5a62472d97d71d690cf7c4362a2547a26005b7 Mon Sep 17 00:00:00 2001 From: Peter Howell Date: Sun, 9 Nov 2025 20:30:33 +0000 Subject: [PATCH] content and outcomes update --- content.py | 64 ++++++++++++++++++++++++++-- curric2022.py | 116 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 142 insertions(+), 38 deletions(-) diff --git a/content.py b/content.py index a2587f0..79263ce 100644 --- a/content.py +++ b/content.py @@ -169,6 +169,59 @@ def extract_forums(id, course_folder, item_id_to_index, verbose=0, discussion_li # # Download everything interesting in a course to a local folder # Build a master file with the entire class content +# Adjust image paths in aggregated snippets so they work from the course root. +def adjust_fullcourse_image_sources(html_fragment): + if not html_fragment: + return html_fragment + + def _prefix_images(match): + prefix = match.group(1) + path = match.group(2) + normalized = path.lstrip('./') + if normalized.lower().startswith('pages/'): + return f"{prefix}{normalized}" + return f"{prefix}pages/{normalized}" + + src_pattern = re.compile(r'(]+?\bsrc\s*=\s*[\'"])(?:\./)?(images/[^\'"]*)', re.IGNORECASE) + html_fragment = src_pattern.sub(_prefix_images, html_fragment) + + canvas_pattern = re.compile(r'(]+?\bdata-canvas-src\s*=\s*[\'"])(?:\./)?(images/[^\'"]*)', re.IGNORECASE) + html_fragment = canvas_pattern.sub(_prefix_images, html_fragment) + + srcset_pattern = re.compile(r'(]+?\bsrcset\s*=\s*[\'"])([^\'"]*)([\'"])', re.IGNORECASE | re.DOTALL) + + def _prefix_srcset(match): + prefix = match.group(1) + value = match.group(2) + suffix = match.group(3) + entries = [] + changed = False + for chunk in value.split(','): + chunk = chunk.strip() + if not chunk: + continue + parts = chunk.split() + url = parts[0] + descriptors = parts[1:] + normalized = url.lstrip('./') + if normalized.lower().startswith('pages/'): + new_url = url + elif normalized.lower().startswith('images/'): + new_url = f"pages/{normalized}" + changed = True + else: + new_url = url + descriptor_text = ' '.join(descriptors) + entry = f"{new_url} {descriptor_text}".strip() + entries.append(entry) + if not changed: + return match.group(0) + return f"{prefix}{', '.join(entries)}{suffix}" + + html_fragment = srcset_pattern.sub(_prefix_srcset, html_fragment) + + return html_fragment + def course_download(id=""): global items @@ -203,6 +256,7 @@ def course_download(id=""): file_local_map = {} discussion_local_map = {} module_details = [] + canvas_host = urlparse(url).hostname if url else None for m in modules: items[running_index] = '

%s

%s\n' % ( m['name'], pagebreak ) @@ -352,7 +406,9 @@ def course_download(id=""): return mapped, canvas_override or absolute_src try: - response = requests.get(absolute_src, headers=header, stream=True, timeout=30) + target_host = urlparse(absolute_src).hostname + request_headers = header if not canvas_host or target_host == canvas_host else None + response = requests.get(absolute_src, headers=request_headers, stream=True, timeout=30) response.raise_for_status() except Exception as e: d(f" * error downloading image {absolute_src}: {e}") @@ -662,7 +718,7 @@ def course_download(id=""): for I in items: if I: - mycourse.write( I ) + mycourse.write(adjust_fullcourse_image_sources(I)) mycourse.write("\n") @@ -684,8 +740,8 @@ def course_download(id=""): if video_link_list: mycourse.write('\n

Videos Linked in Pages

\n') for V in video_link_list: - (url, txt, pg) = V - mycourse.write("\n") + video_url, txt, pg = V + mycourse.write("\n") mycourse.write("
"+txt+" on " + pg + "
"+txt+" on " + pg + "
\n") mycourse.close() diff --git a/curric2022.py b/curric2022.py index 9de983e..9adbeaf 100644 --- a/curric2022.py +++ b/curric2022.py @@ -116,6 +116,7 @@ def recur_matcher(item, depth=0): num_failed_course = 1 +# Capture a single course payload for structured traversal. def single_course_parse(c): global num_failed_course this_course = [] @@ -129,14 +130,37 @@ def single_course_parse(c): ooops.close() num_failed_course = num_failed_course + 1 return ("-1", []) - + +# Normalize course file payloads so downstream code always gets a list of instances. +def load_course_file(path): + try: + raw_data = json.loads(codecs.open(path, 'r', 'utf-8').read(), strict=False) + except Exception as e: + print(f"Unable to read {path}: {e}") + return [] + + if isinstance(raw_data, dict): + if 'entityInstances' in raw_data: + return raw_data.get('entityInstances', []) + return [raw_data] + + if isinstance(raw_data, list): + if raw_data and isinstance(raw_data[0], dict) and 'entityInstances' in raw_data[0]: + instances = [] + for block in raw_data: + if isinstance(block, dict) and 'entityInstances' in block: + instances.extend(block.get('entityInstances', [])) + return instances + return raw_data + + return [] def match_style_test(): classes = {} oo = codecs.open("cache/courses/curric2022test.json","w","utf-8") for f in os.listdir('cache/courses'): - if re.search('classes_',f): + if re.search(r'classes_',f): print(f) - cls = json.loads(codecs.open('cache/courses/'+f,'r','utf-8').read()) + cls = load_course_file('cache/courses/'+f) for c in cls: id,output = single_course_parse(c) classes[id] = "\n".join(output) @@ -170,7 +194,7 @@ def path_style_prog(): classes = {} oo = codecs.open("cache/programs/allprogrampaths.txt","w","utf-8") for f in os.listdir('cache/programs'): - if re.search('^programs_',f): + if re.search(r'^programs_',f): print(f) cls = json.loads(codecs.open('cache/programs/'+f,'r','utf-8').read()) for c in cls: @@ -195,6 +219,30 @@ def all_outcomes(): csvwriter = csv.writer(csvfile) csvwriter.writerow('code cqcourseid coursestatus termineffect dept num cqoutcomeid outcome'.split(' ')) +# Export sorted course titles from the raw course path dump. +def export_course_titles(): + source_path = 'cache/courses/allclasspaths.txt' + dest_path = 'cache/courses/allclasstitles.txt' + pattern = re.compile(r'^Course\/(\d+)\/Course Description\/entityTitle\/(.*)$') + titles = [] + + try: + with codecs.open(source_path, 'r', 'utf-8') as infile: + for line in infile: + match = pattern.match(line.strip()) + if match: + title = match.group(2).strip() + if title: + titles.append(title) + except FileNotFoundError: + print(f"Source file not found: {source_path}") + return + + titles.sort(key=lambda s: s.lower()) + with codecs.open(dest_path, 'w', 'utf-8') as outfile: + for title in titles: + outfile.write(title + '\n') + csvfile2 = codecs.open('cache/courses/all_active_outcomes.csv','w','utf-8') csvwriter2 = csv.writer(csvfile2) csvwriter2.writerow('code cqcourseid coursestatus termineffect dept num cqoutcomeid outcome'.split(' ')) @@ -211,7 +259,7 @@ def all_outcomes(): count = 0 for L in rr: - a = re.search('Course\/(\d+)',L) + a = re.search(r'Course/(\d+)',L) if a: course_num = a.group(1) #print(course_num, current_course_num) @@ -234,25 +282,25 @@ def all_outcomes(): current_course['c'] = course_num - a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Discipline\/(.*)$',L) + a = re.search(r'Course/(\d+)/1/Course Description/0/Course Discipline/(.*)$',L) if a: current_course['d'] = a.group(2) - a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Number\/(.*)$',L) + a = re.search(r'Course/(\d+)/1/Course Description/0/Course Number/(.*)$',L) if a: current_course['n'] = a.group(2) - a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Title\/(.*)$',L) + a = re.search(r'Course/(\d+)/1/Course Description/0/Course Title/(.*)$',L) if a: current_course['T'] = a.group(2) - a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Short\ Title\/(.*)$',L) + a = re.search(r'Course/(\d+)/1/Course Description/0/Short Title/(.*)$',L) if a: current_course['t'] = a.group(2) - a = re.search('Course\ Description\/status\/(.*)$',L) + a = re.search(r'Course Description/status/(.*)$',L) if a: current_course['s'] = a.group(1) - a = re.search('Course\ Content\/\d+\/Lecture\ Content\/Curriculum\ Approval\ Date:\s*(.*)$',L) + a = re.search(r'Course Content/\d+/Lecture Content/Curriculum Approval Date:\s*(.*)$',L) if a: current_course['a'] = a.group(1) - a = re.search('Course\ Description\/\d+\/Internal\ Processing\ Term\/(.*)$',L) + a = re.search(r'Course Description/\d+/Internal Processing Term/(.*)$',L) if a: t_code = term_txt_to_code(a.group(1)) current_course['m'] = t_code @@ -262,20 +310,20 @@ def all_outcomes(): # Course/3091/1/Course Description/0/Internal Processing Term/Spring 2018 - a = re.search('Learning\ Outcomes\/\d+\/(cqid_\d+)\/Learning\ Outcomes\/Description\/(.*)$',L) + a = re.search(r'Learning Outcomes/\d+/(cqid_\d+)/Learning Outcomes/Description/(.*)$',L) if a: current_course['o'].append(a.group(2)) current_course['i'] = a.group(1) - csvwriter.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)]) + csvwriter2.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)]) if current_course['s']=='Active': csvwriter2.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)]) - if re.search('Learning\ Outcomes\/Description\/',L): + if re.search(r'Learning Outcomes/Description/',L): ww.write(L) - if re.search('Description\/entityTitle\/',L): + if re.search(r'Description/entityTitle/',L): ww.write(L) - if re.search('Description\/status\/',L): + if re.search(r'Description/status/',L): ww.write(L) xx = codecs.open("cache/courses/course_cq_index.json","w", "utf-8") @@ -601,20 +649,20 @@ def course_path_style_2_html(): active_courses = {} lookup_table = { 'entityTitle':'title', 'proposalType':'type', - '\/Course\sDescription\/status':'status', 'Course\sDiscipline':'dept', - 'Course\sNumber':'number', 'Course\sTitle':'name', 'Course Description\/\d\/Justification':'justification', - 'Short\sTitle':'shortname', 'Course Description\/\d\/Internal\sProcessing\sTerm':'term', 'This\sCourse\sIs\sDegree\sApplicable':'degree_applicable', - '\/Course\sDescription\/\d+\/Course\sDescription\/':'desc', - 'Minimum\sUnits':'min_units', 'Minimum\sLecture\sHour':'min_lec_hour', 'Minimum\sLab\sHour':'min_lab_hour', 'Course\shas\svariable\shours':'has_var_hours', - 'Number\sWeeks':'weeks', - 'Maximum\sUnits':'max_units', 'Credit\sStatus':'credit_status', - 'TOP\sCode':'top_code', 'Classification':'classification', 'Non\sCredit\sCategory':'noncredit_category', 'Stand-Alone\sClass?':'stand_alone', - 'Grade\sOption':'grade_option', 'Is\sRepeatable':'repeatable', 'Learning\sOutcomes\/Description':'slo', - 'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sState\sUniversities\sand\sColleges?':'transfer_csu', - 'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sUniversity\sof\sCalifornia?':'transfer_uc', - '\/Catalog\sCourse\sSummary\sView\/':'catalog', - '\/Course\sContent/\d+/Lecture\sContent\/':'content', - '\/ASSIST\sPreview\/\d+\/Outcomes\sand\sObjectives\/':'objectives'} + r'/Course\sDescription/status':'status', r'Course\sDiscipline':'dept', + r'Course\sNumber':'number', r'Course\sTitle':'name', r'Course Description/\d/Justification':'justification', + r'Short\sTitle':'shortname', r'Course Description/\d/Internal\sProcessing\sTerm':'term', r'This\sCourse\sIs\sDegree\sApplicable':'degree_applicable', + r'/Course\sDescription/\d+/Course\sDescription/':'desc', + r'Minimum\sUnits':'min_units', r'Minimum\sLecture\sHour':'min_lec_hour', r'Minimum\sLab\sHour':'min_lab_hour', r'Course\shas\svariable\shours':'has_var_hours', + r'Number\sWeeks':'weeks', + r'Maximum\sUnits':'max_units', r'Credit\sStatus':'credit_status', + r'TOP\sCode':'top_code', r'Classification':'classification', r'Non\sCredit\sCategory':'noncredit_category', r'Stand-Alone\sClass\?':'stand_alone', + r'Grade\sOption':'grade_option', r'Is\sRepeatable':'repeatable', r'Learning\sOutcomes/Description':'slo', + r'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sState\sUniversities\sand\sColleges\?':'transfer_csu', + r'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sUniversity\sof\sCalifornia\?':'transfer_uc', + r'/Catalog\sCourse\sSummary\sView/':'catalog', + r'/Course\sContent/\d+/Lecture\sContent/':'content', + r'/ASSIST\sPreview/\d+/Outcomes\sand\sObjectives/':'objectives'} for C in sorted(list(course_prebuild.keys()),key=int): v = 0 @@ -817,9 +865,9 @@ def path_style_test(): classes = {} oo = codecs.open("cache/courses/allclasspaths.txt","w","utf-8") for f in os.listdir('cache/courses'): - if re.search('^classes_',f): + if re.search(r'^classes_',f): print(f) - cls = json.loads(codecs.open('cache/courses/'+f,'r','utf-8').read(),strict=False) + cls = load_course_file('cache/courses/'+f) for c in cls: id,output = single_course_path_parse(c) classes[id] = "\n".join(output) @@ -976,6 +1024,7 @@ if __name__ == "__main__": 6: ['extract de info from class paths', de_classpaths], 7: ['build schedule or summary for SLO planning', slo_summary_report], 8: ['remove deactivated courses', filter_classes], + 9: ['export sorted course titles', export_course_titles], 10: ['fetch all programs', fetch_all_programs], 11: ['process all programs', path_style_prog], 12: ['programs - path style to html catalog', path_style_2_html], @@ -997,4 +1046,3 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() -