content and outcomes update

This commit is contained in:
Peter Howell 2025-11-09 20:30:33 +00:00
parent 08514ad69e
commit 0e5a62472d
2 changed files with 142 additions and 38 deletions

View File

@ -169,6 +169,59 @@ def extract_forums(id, course_folder, item_id_to_index, verbose=0, discussion_li
#
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
# Adjust image paths in aggregated snippets so they work from the course root.
def adjust_fullcourse_image_sources(html_fragment):
    """Rewrite relative image URLs in an aggregated HTML fragment.

    Page snippets reference their images as ``images/...`` (relative to the
    page folder); the aggregated full-course file lives one level up, so
    those URLs need a ``pages/`` prefix.  Handles the ``src``,
    ``data-canvas-src`` and ``srcset`` attributes of ``<img>`` tags.
    Returns the fragment unchanged when it is falsy (``None`` / ``""``).
    """
    if not html_fragment:
        return html_fragment

    def _strip_dot_slash(path):
        # Remove only leading "./" segments.  The original used
        # str.lstrip('./'), which strips any run of '.' and '/' characters
        # and mangled "../images/x" into "images/x", causing a bogus
        # "pages/" prefix on parent-relative URLs.
        while path.startswith('./'):
            path = path[2:]
        return path

    def _prefix_images(match):
        prefix = match.group(1)
        normalized = _strip_dot_slash(match.group(2))
        if normalized.lower().startswith('pages/'):
            # Already rooted correctly; leave it alone.
            return f"{prefix}{normalized}"
        return f"{prefix}pages/{normalized}"

    src_pattern = re.compile(r'(<img[^>]+?\bsrc\s*=\s*[\'"])(?:\./)?(images/[^\'"]*)', re.IGNORECASE)
    html_fragment = src_pattern.sub(_prefix_images, html_fragment)
    canvas_pattern = re.compile(r'(<img[^>]+?\bdata-canvas-src\s*=\s*[\'"])(?:\./)?(images/[^\'"]*)', re.IGNORECASE)
    html_fragment = canvas_pattern.sub(_prefix_images, html_fragment)

    srcset_pattern = re.compile(r'(<img[^>]+?\bsrcset\s*=\s*[\'"])([^\'"]*)([\'"])', re.IGNORECASE | re.DOTALL)

    def _prefix_srcset(match):
        prefix = match.group(1)
        value = match.group(2)
        suffix = match.group(3)
        entries = []
        changed = False
        # srcset is a comma-separated list of "url [descriptor...]" entries.
        for chunk in value.split(','):
            chunk = chunk.strip()
            if not chunk:
                continue
            parts = chunk.split()
            url = parts[0]
            descriptors = parts[1:]
            normalized = _strip_dot_slash(url)
            lowered = normalized.lower()
            if lowered.startswith('pages/'):
                new_url = url
            elif lowered.startswith('images/'):
                new_url = f"pages/{normalized}"
                changed = True
            else:
                # Absolute, parent-relative, or external URL: keep as-is.
                new_url = url
            descriptor_text = ' '.join(descriptors)
            entry = f"{new_url} {descriptor_text}".strip()
            entries.append(entry)
        if not changed:
            # Nothing rewritten: return the attribute byte-for-byte.
            return match.group(0)
        return f"{prefix}{', '.join(entries)}{suffix}"

    html_fragment = srcset_pattern.sub(_prefix_srcset, html_fragment)
    return html_fragment
def course_download(id=""):
global items
@ -203,6 +256,7 @@ def course_download(id=""):
file_local_map = {}
discussion_local_map = {}
module_details = []
canvas_host = urlparse(url).hostname if url else None
for m in modules:
items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
@ -352,7 +406,9 @@ def course_download(id=""):
return mapped, canvas_override or absolute_src
try:
response = requests.get(absolute_src, headers=header, stream=True, timeout=30)
target_host = urlparse(absolute_src).hostname
request_headers = header if not canvas_host or target_host == canvas_host else None
response = requests.get(absolute_src, headers=request_headers, stream=True, timeout=30)
response.raise_for_status()
except Exception as e:
d(f" * error downloading image {absolute_src}: {e}")
@ -662,7 +718,7 @@ def course_download(id=""):
for I in items:
if I:
mycourse.write( I )
mycourse.write(adjust_fullcourse_image_sources(I))
mycourse.write("\n</body></html>")
@ -684,8 +740,8 @@ def course_download(id=""):
if video_link_list:
mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
for V in video_link_list:
(url, txt, pg) = V
mycourse.write("<tr><td><a target='_blank' href='"+url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
video_url, txt, pg = V
mycourse.write("<tr><td><a target='_blank' href='"+video_url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
mycourse.write("</table>\n")
mycourse.close()

View File

@ -116,6 +116,7 @@ def recur_matcher(item, depth=0):
num_failed_course = 1
# Capture a single course payload for structured traversal.
def single_course_parse(c):
global num_failed_course
this_course = []
@ -129,14 +130,37 @@ def single_course_parse(c):
ooops.close()
num_failed_course = num_failed_course + 1
return ("-1", [])
# Normalize course file payloads so downstream code always gets a list of instances.
def load_course_file(path):
    """Read a cached course JSON file and return a list of entity instances.

    Accepts any of the payload shapes the exports produce:
    * dict with an 'entityInstances' key  -> that list
    * bare dict                           -> wrapped in a one-element list
    * list of dicts carrying 'entityInstances' -> their concatenation
    * plain list                          -> returned as-is
    Unreadable files (or any other JSON type) yield an empty list so
    callers can iterate unconditionally.
    """
    try:
        # 'with' guarantees the handle is closed; the original leaked it
        # via codecs.open(...).read().
        with codecs.open(path, 'r', 'utf-8') as fh:
            raw_data = json.loads(fh.read(), strict=False)
    except Exception as e:
        # Best-effort load: report the problem and let the caller skip it.
        print(f"Unable to read {path}: {e}")
        return []
    if isinstance(raw_data, dict):
        if 'entityInstances' in raw_data:
            return raw_data.get('entityInstances', [])
        return [raw_data]
    if isinstance(raw_data, list):
        if raw_data and isinstance(raw_data[0], dict) and 'entityInstances' in raw_data[0]:
            instances = []
            for block in raw_data:
                if isinstance(block, dict) and 'entityInstances' in block:
                    instances.extend(block.get('entityInstances', []))
            return instances
        return raw_data
    return []
def match_style_test():
classes = {}
oo = codecs.open("cache/courses/curric2022test.json","w","utf-8")
for f in os.listdir('cache/courses'):
if re.search('classes_',f):
if re.search(r'classes_',f):
print(f)
cls = json.loads(codecs.open('cache/courses/'+f,'r','utf-8').read())
cls = load_course_file('cache/courses/'+f)
for c in cls:
id,output = single_course_parse(c)
classes[id] = "\n".join(output)
@ -170,7 +194,7 @@ def path_style_prog():
classes = {}
oo = codecs.open("cache/programs/allprogrampaths.txt","w","utf-8")
for f in os.listdir('cache/programs'):
if re.search('^programs_',f):
if re.search(r'^programs_',f):
print(f)
cls = json.loads(codecs.open('cache/programs/'+f,'r','utf-8').read())
for c in cls:
@ -195,6 +219,30 @@ def all_outcomes():
csvwriter = csv.writer(csvfile)
csvwriter.writerow('code cqcourseid coursestatus termineffect dept num cqoutcomeid outcome'.split(' '))
# Export sorted course titles from the raw course path dump.
def export_course_titles():
    """Collect entityTitle values from the class-path dump and write them,
    case-insensitively sorted, one per line, to allclasstitles.txt.
    Prints a notice and returns early when the dump is missing.
    """
    source_path = 'cache/courses/allclasspaths.txt'
    dest_path = 'cache/courses/allclasstitles.txt'
    line_re = re.compile(r'^Course\/(\d+)\/Course Description\/entityTitle\/(.*)$')
    try:
        with codecs.open(source_path, 'r', 'utf-8') as infile:
            # Keep only lines shaped like "Course/<id>/Course Description/
            # entityTitle/<title>" with a non-blank title.
            matches = (line_re.match(raw.strip()) for raw in infile)
            collected = [m.group(2).strip() for m in matches
                         if m and m.group(2).strip()]
    except FileNotFoundError:
        print(f"Source file not found: {source_path}")
        return
    collected.sort(key=str.lower)
    with codecs.open(dest_path, 'w', 'utf-8') as outfile:
        outfile.writelines(t + '\n' for t in collected)
csvfile2 = codecs.open('cache/courses/all_active_outcomes.csv','w','utf-8')
csvwriter2 = csv.writer(csvfile2)
csvwriter2.writerow('code cqcourseid coursestatus termineffect dept num cqoutcomeid outcome'.split(' '))
@ -211,7 +259,7 @@ def all_outcomes():
count = 0
for L in rr:
a = re.search('Course\/(\d+)',L)
a = re.search(r'Course/(\d+)',L)
if a:
course_num = a.group(1)
#print(course_num, current_course_num)
@ -234,25 +282,25 @@ def all_outcomes():
current_course['c'] = course_num
a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Discipline\/(.*)$',L)
a = re.search(r'Course/(\d+)/1/Course Description/0/Course Discipline/(.*)$',L)
if a:
current_course['d'] = a.group(2)
a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Number\/(.*)$',L)
a = re.search(r'Course/(\d+)/1/Course Description/0/Course Number/(.*)$',L)
if a:
current_course['n'] = a.group(2)
a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Title\/(.*)$',L)
a = re.search(r'Course/(\d+)/1/Course Description/0/Course Title/(.*)$',L)
if a:
current_course['T'] = a.group(2)
a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Short\ Title\/(.*)$',L)
a = re.search(r'Course/(\d+)/1/Course Description/0/Short Title/(.*)$',L)
if a:
current_course['t'] = a.group(2)
a = re.search('Course\ Description\/status\/(.*)$',L)
a = re.search(r'Course Description/status/(.*)$',L)
if a:
current_course['s'] = a.group(1)
a = re.search('Course\ Content\/\d+\/Lecture\ Content\/Curriculum\ Approval\ Date:\s*(.*)$',L)
a = re.search(r'Course Content/\d+/Lecture Content/Curriculum Approval Date:\s*(.*)$',L)
if a:
current_course['a'] = a.group(1)
a = re.search('Course\ Description\/\d+\/Internal\ Processing\ Term\/(.*)$',L)
a = re.search(r'Course Description/\d+/Internal Processing Term/(.*)$',L)
if a:
t_code = term_txt_to_code(a.group(1))
current_course['m'] = t_code
@ -262,20 +310,20 @@ def all_outcomes():
# Course/3091/1/Course Description/0/Internal Processing Term/Spring 2018
a = re.search('Learning\ Outcomes\/\d+\/(cqid_\d+)\/Learning\ Outcomes\/Description\/(.*)$',L)
a = re.search(r'Learning Outcomes/\d+/(cqid_\d+)/Learning Outcomes/Description/(.*)$',L)
if a:
current_course['o'].append(a.group(2))
current_course['i'] = a.group(1)
csvwriter.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)])
csvwriter2.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)])
if current_course['s']=='Active':
csvwriter2.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)])
if re.search('Learning\ Outcomes\/Description\/',L):
if re.search(r'Learning Outcomes/Description/',L):
ww.write(L)
if re.search('Description\/entityTitle\/',L):
if re.search(r'Description/entityTitle/',L):
ww.write(L)
if re.search('Description\/status\/',L):
if re.search(r'Description/status/',L):
ww.write(L)
xx = codecs.open("cache/courses/course_cq_index.json","w", "utf-8")
@ -601,20 +649,20 @@ def course_path_style_2_html():
active_courses = {}
lookup_table = { 'entityTitle':'title', 'proposalType':'type',
'\/Course\sDescription\/status':'status', 'Course\sDiscipline':'dept',
'Course\sNumber':'number', 'Course\sTitle':'name', 'Course Description\/\d\/Justification':'justification',
'Short\sTitle':'shortname', 'Course Description\/\d\/Internal\sProcessing\sTerm':'term', 'This\sCourse\sIs\sDegree\sApplicable':'degree_applicable',
'\/Course\sDescription\/\d+\/Course\sDescription\/':'desc',
'Minimum\sUnits':'min_units', 'Minimum\sLecture\sHour':'min_lec_hour', 'Minimum\sLab\sHour':'min_lab_hour', 'Course\shas\svariable\shours':'has_var_hours',
'Number\sWeeks':'weeks',
'Maximum\sUnits':'max_units', 'Credit\sStatus':'credit_status',
'TOP\sCode':'top_code', 'Classification':'classification', 'Non\sCredit\sCategory':'noncredit_category', 'Stand-Alone\sClass?':'stand_alone',
'Grade\sOption':'grade_option', 'Is\sRepeatable':'repeatable', 'Learning\sOutcomes\/Description':'slo',
'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sState\sUniversities\sand\sColleges?':'transfer_csu',
'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sUniversity\sof\sCalifornia?':'transfer_uc',
'\/Catalog\sCourse\sSummary\sView\/':'catalog',
'\/Course\sContent/\d+/Lecture\sContent\/':'content',
'\/ASSIST\sPreview\/\d+\/Outcomes\sand\sObjectives\/':'objectives'}
r'/Course\sDescription/status':'status', r'Course\sDiscipline':'dept',
r'Course\sNumber':'number', r'Course\sTitle':'name', r'Course Description/\d/Justification':'justification',
r'Short\sTitle':'shortname', r'Course Description/\d/Internal\sProcessing\sTerm':'term', r'This\sCourse\sIs\sDegree\sApplicable':'degree_applicable',
r'/Course\sDescription/\d+/Course\sDescription/':'desc',
r'Minimum\sUnits':'min_units', r'Minimum\sLecture\sHour':'min_lec_hour', r'Minimum\sLab\sHour':'min_lab_hour', r'Course\shas\svariable\shours':'has_var_hours',
r'Number\sWeeks':'weeks',
r'Maximum\sUnits':'max_units', r'Credit\sStatus':'credit_status',
r'TOP\sCode':'top_code', r'Classification':'classification', r'Non\sCredit\sCategory':'noncredit_category', r'Stand-Alone\sClass\?':'stand_alone',
r'Grade\sOption':'grade_option', r'Is\sRepeatable':'repeatable', r'Learning\sOutcomes/Description':'slo',
r'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sState\sUniversities\sand\sColleges\?':'transfer_csu',
r'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sUniversity\sof\sCalifornia\?':'transfer_uc',
r'/Catalog\sCourse\sSummary\sView/':'catalog',
r'/Course\sContent/\d+/Lecture\sContent/':'content',
r'/ASSIST\sPreview/\d+/Outcomes\sand\sObjectives/':'objectives'}
for C in sorted(list(course_prebuild.keys()),key=int):
v = 0
@ -817,9 +865,9 @@ def path_style_test():
classes = {}
oo = codecs.open("cache/courses/allclasspaths.txt","w","utf-8")
for f in os.listdir('cache/courses'):
if re.search('^classes_',f):
if re.search(r'^classes_',f):
print(f)
cls = json.loads(codecs.open('cache/courses/'+f,'r','utf-8').read(),strict=False)
cls = load_course_file('cache/courses/'+f)
for c in cls:
id,output = single_course_path_parse(c)
classes[id] = "\n".join(output)
@ -976,6 +1024,7 @@ if __name__ == "__main__":
6: ['extract de info from class paths', de_classpaths],
7: ['build schedule or summary for SLO planning', slo_summary_report],
8: ['remove deactivated courses', filter_classes],
9: ['export sorted course titles', export_course_titles],
10: ['fetch all programs', fetch_all_programs],
11: ['process all programs', path_style_prog],
12: ['programs - path style to html catalog', path_style_2_html],
@ -997,4 +1046,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()