From 0e5a62472d97d71d690cf7c4362a2547a26005b7 Mon Sep 17 00:00:00 2001
From: Peter Howell <peter.howell@gmail.com>
Date: Sun, 9 Nov 2025 20:30:33 +0000
Subject: [PATCH] content and outcomes update

---
 content.py    |  64 ++++++++++++++++++++++++++--
 curric2022.py | 116 +++++++++++++++++++++++++++++++++++---------------
 2 files changed, 142 insertions(+), 38 deletions(-)
diff --git a/content.py b/content.py
index a2587f0..79263ce 100644
--- a/content.py
+++ b/content.py
@@ -169,6 +169,59 @@ def extract_forums(id, course_folder, item_id_to_index, verbose=0, discussion_li
 #
 # Download everything interesting in a course to a local folder
 # Build a master file with the entire class content
+# Adjust image paths in aggregated snippets so they work from the course root.
+def adjust_fullcourse_image_sources(html_fragment):
+    """Prefix page-relative image URLs in <img> tags with 'pages/'.
+
+    Rewrites src, data-canvas-src and srcset values that start with
+    'images/' (optionally './images/') so they resolve from the course root.
+    Other URLs, including '../' and '/'-rooted ones, are left unchanged.
+
+    html_fragment: HTML text; falsy values are returned as-is.
+    Returns the fragment with the matching image URLs rewritten.
+    """
+    if not html_fragment:
+        return html_fragment
+
+    def _relative(url):
+        # Drop one leading './' only; str.lstrip('./') would also eat '../' and '/'.
+        return url[2:] if url.startswith('./') else url
+
+    def _prefix_images(match):
+        # group(2) always starts with 'images/', so it only needs the prefix.
+        return f"{match.group(1)}pages/{match.group(2)}"
+
+    def _prefix_srcset(match):
+        # Each comma-separated candidate is 'URL [descriptor ...]'.
+        entries = []
+        changed = False
+        for chunk in match.group(2).split(','):
+            parts = chunk.split()
+            if not parts:
+                continue
+            normalized = _relative(parts[0])
+            if normalized.lower().startswith('images/'):
+                parts[0] = f"pages/{normalized}"
+                changed = True
+            entries.append(' '.join(parts))
+        if not changed:
+            # Keep the attribute byte-identical when nothing was rewritten.
+            return match.group(0)
+        return f"{match.group(1)}{', '.join(entries)}{match.group(3)}"
+
+    # (?<![\w-]) anchors the attribute name so 'src' cannot match inside
+    # 'data-canvas-src', which would leave the real src attribute untouched.
+    attr = r'(<img[^>]+?(?<![\w-])%s\s*=\s*[\'"])'
+    url_value = r'(?:\./)?(images/[^\'"]*)'
+    for name in ('src', 'data-canvas-src'):
+        pattern = re.compile(attr % name + url_value, re.IGNORECASE)
+        html_fragment = pattern.sub(_prefix_images, html_fragment)
+
+    srcset_pattern = re.compile(
+        attr % 'srcset' + r'([^\'"]*)([\'"])', re.IGNORECASE | re.DOTALL
+    )
+    return srcset_pattern.sub(_prefix_srcset, html_fragment)
+
 def course_download(id=""):
     global items
 
@@ -203,6 +256,7 @@ def course_download(id=""):
     file_local_map = {}
     discussion_local_map = {}
     module_details = []
+    canvas_host = urlparse(url).hostname if url else None
     
     for m in modules:
         items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
@@ -352,7 +406,9 @@ def course_download(id=""):
             return mapped, canvas_override or absolute_src
 
         try:
-            response = requests.get(absolute_src, headers=header, stream=True, timeout=30)
+            target_host = urlparse(absolute_src).hostname
+            request_headers = header if not canvas_host or target_host == canvas_host else None
+            response = requests.get(absolute_src, headers=request_headers, stream=True, timeout=30)
             response.raise_for_status()
         except Exception as e:
             d(f"   * error downloading image {absolute_src}: {e}")
@@ -662,7 +718,7 @@ def course_download(id=""):
     
     for I in items:
         if I:
-            mycourse.write(  I  )
+            mycourse.write(adjust_fullcourse_image_sources(I))
     mycourse.write("\n</body></html>")
     
     
@@ -684,8 +740,8 @@ def course_download(id=""):
     if video_link_list:
         mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
         for V in video_link_list:
-            (url, txt, pg) = V
-            mycourse.write("<tr><td><a target='_blank' href='"+url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
+            video_url, txt, pg = V
+            mycourse.write("<tr><td><a target='_blank' href='"+video_url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
         mycourse.write("</table>\n")
             
     mycourse.close()
diff --git a/curric2022.py b/curric2022.py
index 9de983e..9adbeaf 100644
--- a/curric2022.py
+++ b/curric2022.py
@@ -116,6 +116,7 @@ def recur_matcher(item, depth=0):
 
 num_failed_course = 1
 
+# Capture a single course payload for structured traversal.
 def single_course_parse(c):
     global num_failed_course
     this_course = []
@@ -129,14 +130,61 @@ def single_course_parse(c):
         ooops.close()
         num_failed_course = num_failed_course + 1
         return ("-1", [])
-        
+
+# Normalize course file payloads so downstream code always gets a list of instances.
+def load_course_file(path):
+    """Return the course instances stored in the JSON file at path ([] on failure)."""
+    try:
+        # 'with' closes the handle even if reading or parsing raises.
+        with codecs.open(path, 'r', 'utf-8') as infile:
+            raw_data = json.loads(infile.read(), strict=False)
+    except Exception as e:
+        print(f"Unable to read {path}: {e}")
+        return []
+
+    if isinstance(raw_data, dict):
+        if 'entityInstances' not in raw_data:
+            return [raw_data]
+        return raw_data['entityInstances'] or []
+    if not isinstance(raw_data, list):
+        return []
+    if raw_data and isinstance(raw_data[0], dict) and 'entityInstances' in raw_data[0]:
+        # A list of wrapper dicts is flattened into one list of instances.
+        wrappers = [b for b in raw_data if isinstance(b, dict) and 'entityInstances' in b]
+        return [inst for b in wrappers for inst in (b['entityInstances'] or [])]
+    return raw_data
+
+# Export sorted course titles from the raw course path dump.
+def export_course_titles():
+    source_path = 'cache/courses/allclasspaths.txt'
+    dest_path = 'cache/courses/allclasstitles.txt'
+    pattern = re.compile(r'^Course\/(\d+)\/Course Description\/entityTitle\/(.*)$')
+    titles = []
+
+    try:
+        with codecs.open(source_path, 'r', 'utf-8') as infile:
+            for line in infile:
+                match = pattern.match(line.strip())
+                if match:
+                    title = match.group(2).strip()
+                    if title:
+                        titles.append(title)
+    except FileNotFoundError:
+        print(f"Source file not found: {source_path}")
+        return
+
+    titles.sort(key=lambda s: s.lower())
+    with codecs.open(dest_path, 'w', 'utf-8') as outfile:
+        for title in titles:
+            outfile.write(title + '\n')
+
 def match_style_test():
     classes = {}
     oo = codecs.open("cache/courses/curric2022test.json","w","utf-8")
     for f in os.listdir('cache/courses'):
-        if re.search('classes_',f):
+        if re.search(r'classes_',f):
             print(f)
-            cls = json.loads(codecs.open('cache/courses/'+f,'r','utf-8').read())
+            cls = load_course_file('cache/courses/'+f)
             for c in cls:
                 id,output = single_course_parse(c)
                 classes[id] = "\n".join(output)
@@ -170,7 +218,7 @@ def path_style_prog():
     classes = {}
     oo = codecs.open("cache/programs/allprogrampaths.txt","w","utf-8")
     for f in os.listdir('cache/programs'):
-        if re.search('^programs_',f):
+        if re.search(r'^programs_',f):
             print(f)
             cls = json.loads(codecs.open('cache/programs/'+f,'r','utf-8').read())
             for c in cls:
@@ -211,7 +259,7 @@ def all_outcomes():
     count = 0
 
     for L in rr:
-        a = re.search('Course\/(\d+)',L)
+        a = re.search(r'Course/(\d+)',L)
         if a:
             course_num = a.group(1)
             #print(course_num, current_course_num)
@@ -234,25 +282,25 @@ def all_outcomes():
             current_course['c'] = course_num
         
         
-        a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Discipline\/(.*)$',L)
+        a = re.search(r'Course/(\d+)/1/Course Description/0/Course Discipline/(.*)$',L)
         if a:
             current_course['d'] = a.group(2)
-        a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Number\/(.*)$',L)
+        a = re.search(r'Course/(\d+)/1/Course Description/0/Course Number/(.*)$',L)
         if a:
             current_course['n'] = a.group(2)
-        a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Course\ Title\/(.*)$',L)
+        a = re.search(r'Course/(\d+)/1/Course Description/0/Course Title/(.*)$',L)
         if a:
             current_course['T'] = a.group(2)
-        a = re.search('Course\/(\d+)\/1\/Course\ Description\/0\/Short\ Title\/(.*)$',L)
+        a = re.search(r'Course/(\d+)/1/Course Description/0/Short Title/(.*)$',L)
         if a:
             current_course['t'] = a.group(2)
-        a = re.search('Course\ Description\/status\/(.*)$',L)
+        a = re.search(r'Course Description/status/(.*)$',L)
         if a:
             current_course['s'] = a.group(1)
-        a = re.search('Course\ Content\/\d+\/Lecture\ Content\/Curriculum\ Approval\ Date:\s*(.*)$',L)
+        a = re.search(r'Course Content/\d+/Lecture Content/Curriculum Approval Date:\s*(.*)$',L)
         if a:
             current_course['a'] = a.group(1)
-        a = re.search('Course\ Description\/\d+\/Internal\ Processing\ Term\/(.*)$',L)
+        a = re.search(r'Course Description/\d+/Internal Processing Term/(.*)$',L)
         if a:
             t_code = term_txt_to_code(a.group(1))
             current_course['m'] = t_code
@@ -262,20 +310,20 @@ def all_outcomes():
         
         # Course/3091/1/Course Description/0/Internal Processing Term/Spring 2018
 
-        a = re.search('Learning\ Outcomes\/\d+\/(cqid_\d+)\/Learning\ Outcomes\/Description\/(.*)$',L)
+        a = re.search(r'Learning Outcomes/\d+/(cqid_\d+)/Learning Outcomes/Description/(.*)$',L)
         if a:
             current_course['o'].append(a.group(2))
             current_course['i'] = a.group(1)
             csvwriter.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)])
             if current_course['s']=='Active':
                 csvwriter2.writerow([current_course['d']+current_course['n'], current_course_num, current_course['s'], current_course['m'], current_course['d'], current_course['n'], current_course['i'], a.group(2)])
             
             
-        if re.search('Learning\ Outcomes\/Description\/',L):
+        if re.search(r'Learning Outcomes/Description/',L):
             ww.write(L)
-        if re.search('Description\/entityTitle\/',L):
+        if re.search(r'Description/entityTitle/',L):
             ww.write(L)
-        if re.search('Description\/status\/',L):
+        if re.search(r'Description/status/',L):
             ww.write(L)
             
     xx = codecs.open("cache/courses/course_cq_index.json","w", "utf-8")
@@ -601,20 +649,20 @@ def course_path_style_2_html():
     active_courses = {}
     
     lookup_table = {    'entityTitle':'title', 'proposalType':'type', 
-                        '\/Course\sDescription\/status':'status', 'Course\sDiscipline':'dept', 
-                        'Course\sNumber':'number', 'Course\sTitle':'name', 'Course Description\/\d\/Justification':'justification',
-                        'Short\sTitle':'shortname', 'Course Description\/\d\/Internal\sProcessing\sTerm':'term', 'This\sCourse\sIs\sDegree\sApplicable':'degree_applicable',
-                        '\/Course\sDescription\/\d+\/Course\sDescription\/':'desc', 
-                        'Minimum\sUnits':'min_units', 'Minimum\sLecture\sHour':'min_lec_hour', 'Minimum\sLab\sHour':'min_lab_hour', 'Course\shas\svariable\shours':'has_var_hours',
-                        'Number\sWeeks':'weeks', 
-                        'Maximum\sUnits':'max_units', 'Credit\sStatus':'credit_status', 
-                        'TOP\sCode':'top_code', 'Classification':'classification', 'Non\sCredit\sCategory':'noncredit_category', 'Stand-Alone\sClass?':'stand_alone', 
-                        'Grade\sOption':'grade_option', 'Is\sRepeatable':'repeatable', 'Learning\sOutcomes\/Description':'slo',
-                        'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sState\sUniversities\sand\sColleges?':'transfer_csu',
-                        'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sUniversity\sof\sCalifornia?':'transfer_uc',
-                        '\/Catalog\sCourse\sSummary\sView\/':'catalog',
-                        '\/Course\sContent/\d+/Lecture\sContent\/':'content',
-                        '\/ASSIST\sPreview\/\d+\/Outcomes\sand\sObjectives\/':'objectives'}
+                        r'/Course\sDescription/status':'status', r'Course\sDiscipline':'dept', 
+                        r'Course\sNumber':'number', r'Course\sTitle':'name', r'Course Description/\d/Justification':'justification',
+                        r'Short\sTitle':'shortname', r'Course Description/\d/Internal\sProcessing\sTerm':'term', r'This\sCourse\sIs\sDegree\sApplicable':'degree_applicable',
+                        r'/Course\sDescription/\d+/Course\sDescription/':'desc', 
+                        r'Minimum\sUnits':'min_units', r'Minimum\sLecture\sHour':'min_lec_hour', r'Minimum\sLab\sHour':'min_lab_hour', r'Course\shas\svariable\shours':'has_var_hours',
+                        r'Number\sWeeks':'weeks', 
+                        r'Maximum\sUnits':'max_units', r'Credit\sStatus':'credit_status', 
+                        r'TOP\sCode':'top_code', r'Classification':'classification', r'Non\sCredit\sCategory':'noncredit_category', r'Stand-Alone\sClass\?':'stand_alone', 
+                        r'Grade\sOption':'grade_option', r'Is\sRepeatable':'repeatable', r'Learning\sOutcomes/Description':'slo',
+                        r'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sState\sUniversities\sand\sColleges\?':'transfer_csu',
+                        r'Is\sThis\sCourse\sis\sRecommended\sfor\sTransfer\sto\sUniversity\sof\sCalifornia\?':'transfer_uc',
+                        r'/Catalog\sCourse\sSummary\sView/':'catalog',
+                        r'/Course\sContent/\d+/Lecture\sContent/':'content',
+                        r'/ASSIST\sPreview/\d+/Outcomes\sand\sObjectives/':'objectives'}
     
     for C in sorted(list(course_prebuild.keys()),key=int):
         v = 0
@@ -817,9 +865,9 @@ def path_style_test():
     classes = {}
     oo = codecs.open("cache/courses/allclasspaths.txt","w","utf-8")
     for f in os.listdir('cache/courses'):
-        if re.search('^classes_',f):
+        if re.search(r'^classes_',f):
             print(f)
-            cls = json.loads(codecs.open('cache/courses/'+f,'r','utf-8').read(),strict=False)
+            cls = load_course_file('cache/courses/'+f)
             for c in cls:
                 id,output = single_course_path_parse(c)
                 classes[id] = "\n".join(output)
@@ -976,6 +1024,7 @@ if __name__ == "__main__":
                 6: ['extract de info from class paths', de_classpaths],
                 7: ['build schedule or summary for SLO planning', slo_summary_report],
                 8: ['remove deactivated courses', filter_classes],
+                9: ['export sorted course titles', export_course_titles],
                 10: ['fetch all programs', fetch_all_programs],
                 11: ['process all programs', path_style_prog],
                 12: ['programs - path style to html catalog', path_style_2_html],
@@ -997,4 +1046,3 @@ if __name__ == "__main__":
     
     # Call the function in the options dict
     options[ int(resp)][1]() 
-