recreate reg_data. what broke it?

2025-08-16 14:33:15 -07:00 · 2025-08-16 14:33:15 -07:00 · f15d103fb8
parent a0a2845198
commit f15d103fb8
1 changed files with 101 additions and 20 deletions
--- a/pipelines.py
+++ b/pipelines.py
@ -1145,6 +1145,17 @@ def process_reg_history():
            grouped[ts] = {r['crn']: r for r in group}
        return grouped

+    def crossed_threshold(old_val, new_val, max_val):
+        """Report the highest enrollment milestone passed between two snapshots.
+
+        Milestones are 25%, 50%, 75% and 100% of ``max_val``. A milestone
+        counts as crossed when the old fill ratio is strictly below it and
+        the new fill ratio is at or above it, so only increases are reported.
+
+        Args:
+            old_val: Previous enrolled count (int or numeric string).
+            new_val: Current enrolled count (int or numeric string).
+            max_val: Section capacity (int or numeric string).
+
+        Returns:
+            ``(True, percent)`` where ``percent`` is the highest milestone
+            crossed as an int (25, 50, 75 or 100), or ``(False, None)`` when
+            no milestone was crossed or the capacity is not positive.
+
+        Raises:
+            ValueError: If any argument cannot be converted with ``int()``.
+        """
+        capacity = int(max_val)
+        if capacity <= 0:
+            # No meaningful ratio without a positive capacity (also avoids
+            # dividing by zero).
+            return False, None
+        old_ratio = int(old_val) / capacity
+        new_ratio = int(new_val) / capacity
+        # Scan from the top down: one snapshot-to-snapshot jump can pass
+        # several milestones, and the highest one is the one worth reporting
+        # (10/100 -> 80/100 is "crossed 75%", not "crossed 25%").
+        for t in (1.0, 0.75, 0.5, 0.25):
+            if old_ratio < t <= new_ratio:
+                return True, int(t * 100)
+        return False, None
+
    def detect_changes(prev, curr):
        changes = defaultdict(list)

@ -1152,42 +1163,111 @@ def process_reg_history():
        for crn in all_crns:
            o, n = prev.get(crn), curr.get(crn)
            if not o:
-                changes[crn].append("Section was added.")
+                changes[crn].append((n['datetime'], "Section was added."))
            elif not n:
-                changes[crn].append("Section was removed.")
+                changes[crn].append((
+                    o['datetime'],
+                    f"Section was removed (last seen: teacher {o['teacher']}, "
+                    f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
+                ))
            else:
+                dt = n['datetime']
                if o['teacher'] != n['teacher']:
-                    changes[crn].append(f"Changed teacher to {n['teacher']}.")
+                    changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
                if o['enrolled'] != n['enrolled']:
-                    if int(n['enrolled']) >= int(n.get('max', 9999)):
-                        changes[crn].append("Filled up.")
-                    else:
-                        changes[crn].append(f"Enrollment changed to {n['enrolled']}.")
-                if int(n.get('waitlisted', 0)) > 10 and o['waitlisted'] != n['waitlisted']:
-                    changes[crn].append(f"Waitlist exceeds 10: {n['waitlisted']}.")
+                    crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n['max'])
+                    if crossed:
+                        changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
+                if int(n['waitlisted']) > 10 and o['waitlisted'] != n['waitlisted']:
+                    changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
        return changes

    def process_diff_timeline(path):
        snapshots = read_grouped_csv(path)
        timeline = sorted(snapshots.keys())
-        reports = []
+        timeline_diffs = []
+        course_names = {}  # crn -> latest known course name

        for i in range(1, len(timeline)):
            prev_ts, curr_ts = timeline[i-1], timeline[i]
            prev, curr = snapshots[prev_ts], snapshots[curr_ts]
+
+            # update course name map
+            for crn, row in curr.items():
+                course_names[crn] = row['course']
+
            delta = detect_changes(prev, curr)
-            if delta:
-                reports.append((curr_ts, delta))
-        return reports
+            timeline_diffs.append(delta)

-    result = process_diff_timeline("cache/reg_history_fa25.csv")
-    for timestamp, changes in result:
-        print(f"\n[{timestamp}]")
-        for crn, msgs in sorted(changes.items()):
-            print(f"  CRN {crn}:")
-            for msg in msgs:
-                print(f"    - {msg}")    
+        # Flatten and group by crn
+        crn_changes = defaultdict(list)
+        for delta in timeline_diffs:
+            for crn, changes in delta.items():
+                crn_changes[crn].extend(changes)

+        # Sort changes for each CRN by datetime
+        for crn in crn_changes:
+            crn_changes[crn].sort(key=lambda x: x[0])
+
+        return crn_changes, course_names
+    
+    output1 = codecs.open('cache/reg_timeline_fa25.txt','w','utf-8')
+    changes, course_names = process_diff_timeline("cache/reg_history_fa25.csv")
+    for crn in sorted(changes, key=lambda c: course_names.get(c, "")):
+        course = course_names.get(crn, "")
+        print(f"\n{course} (CRN {crn}):")
+        output1.write(f"\n{course} (CRN {crn}):\n")
+        for dt, msg in changes[crn]:
+            print(f"  [{dt}] {msg}")
+            output1.write(f"  [{dt}] {msg}\n")
+
+def recreate_reg_data(history_path="cache/reg_history_fa25.csv",
+                      output_path="cache/reg_data_fa25.csv"):
+    """Rebuild the per-day enrollment table from the full registration history.
+
+    The history CSV is read with an explicit column list (datetime, crn,
+    course, teacher, max, enrolled, waitlistmax, waitlisted). For every CRN
+    and calendar day, the enrolled count of the latest snapshot taken that
+    day is kept. The result is written as one row per CRN with one column per
+    day; days on which a CRN has no snapshot are left blank.
+
+    Args:
+        history_path: CSV of registration snapshots to read.
+        output_path: CSV file to (over)write with the pivoted table.
+
+    Returns:
+        None. The table is written to ``output_path``.
+    """
+    import csv
+    from collections import defaultdict
+    from datetime import datetime
+
+    fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max', 'enrolled', 'waitlistmax', 'waitlisted']
+
+    def parse_row(row):
+        """Return (snapshot datetime, crn, enrolled count) for one row."""
+        dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
+        return dt, row['crn'], int(row['enrolled'])
+
+    def reduce_latest_per_day(rows):
+        """Keep, per CRN and day, the snapshot with the latest timestamp.
+
+        Returns (latest, skipped) where latest[crn][date_str] is
+        (dt, enrolled) and skipped counts the rows that could not be parsed.
+        """
+        latest = defaultdict(dict)
+        skipped = 0
+
+        for row in rows:
+            try:
+                dt, crn, enrolled = parse_row(row)
+            except (ValueError, TypeError):
+                # Because fieldnames are supplied explicitly, a header line
+                # or a truncated row arrives here as ordinary data. Skip it
+                # instead of aborting the whole rebuild.
+                skipped += 1
+                continue
+            day = dt.date().isoformat()
+            if day not in latest[crn] or dt > latest[crn][day][0]:
+                latest[crn][day] = (dt, enrolled)
+        return latest, skipped
+
+    def pivot_to_table(latest_data):
+        """Turn latest[crn][day] into a header row plus one row per CRN."""
+        all_dates = sorted({day for days in latest_data.values() for day in days})
+        table = []
+
+        for crn in sorted(latest_data):
+            per_day = latest_data[crn]
+            # Blank cell for days on which this CRN has no snapshot.
+            table.append([crn] + [str(per_day[day][1]) if day in per_day else "" for day in all_dates])
+
+        return ["crn"] + all_dates, table
+
+    # Explicit encoding so the result does not depend on the platform's
+    # locale default; rows are streamed straight from the reader.
+    with open(history_path, newline='', encoding='utf-8') as f:
+        reader = csv.DictReader(f, fieldnames=fieldnames)
+        latest, skipped = reduce_latest_per_day(reader)
+
+    if skipped:
+        print(f"recreate_reg_data: skipped {skipped} unparseable row(s) in {history_path}")
+
+    header, table = pivot_to_table(latest)
+
+    with open(output_path, "w", newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        writer.writerows(table)


 if __name__ == "__main__":
@ -1197,6 +1277,7 @@ if __name__ == "__main__":
                2: ['Get canvas data 2024 style', canvas_data_2024_run ],
                3: ['Set up canvas data 2024 style', setup_canvas_data_2024_run],
                4: ['Narrative timeline of section updates', process_reg_history],
+                5: ['Recreate reg_data from full reg history', recreate_reg_data],
    }
        
    '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,