# NOTE(review): reconstructed from a whitespace-mangled `git diff` of pipelines.py.
# In the original file these three defs are nested inside process_reg_history()
# (the enclosing `def` line is outside the visible hunk).

def _to_int(value, default=0):
    """Best-effort int conversion for CSV fields.

    csv.DictReader with explicit fieldnames yields None for short rows and ''
    for empty cells; bare int() crashes on both. Returns *default* instead.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def crossed_threshold(old_val, new_val, max_val):
    """Return (True, percent) when enrollment crossed a capacity threshold.

    A threshold t in {25%, 50%, 75%, 100%} is "crossed" when
    old/max < t <= new/max between two snapshots. Returns the FIRST threshold
    crossed as an int percent, or (False, None) when none was crossed or the
    capacity is zero/unparseable (a zero-capacity section can never fill).
    """
    capacity = _to_int(max_val)
    if capacity == 0:
        return False, None
    old_ratio = _to_int(old_val) / capacity
    new_ratio = _to_int(new_val) / capacity
    for t in (0.25, 0.5, 0.75, 1.0):
        if old_ratio < t <= new_ratio:
            return True, int(t * 100)
    return False, None


def detect_changes(prev, curr):
    """Diff two snapshots (dicts keyed by CRN) of section registration rows.

    Returns {crn: [(datetime, message), ...]} describing section adds/removals,
    teacher changes, enrollment-threshold crossings, and waitlists over 10.
    Each message is paired with the snapshot datetime it was observed at.
    """
    changes = defaultdict(list)
    # NOTE(review): in the original, all_crns comes from an unseen context line;
    # the union of both snapshots' keys is the only definition consistent with
    # the added/removed branches below — confirm against the full file.
    all_crns = set(prev) | set(curr)
    for crn in all_crns:
        o, n = prev.get(crn), curr.get(crn)
        if not o:
            changes[crn].append((n['datetime'], "Section was added."))
        elif not n:
            changes[crn].append((
                o['datetime'],
                f"Section was removed (last seen: teacher {o['teacher']}, "
                f"{o['enrolled']}/{o['max']} enrolled, {o['waitlisted']}/{o['waitlistmax']} waitlisted)."
            ))
        else:
            dt = n['datetime']
            if o['teacher'] != n['teacher']:
                changes[crn].append((dt, f"Teacher changed from {o['teacher']} to {n['teacher']}."))
            if o['enrolled'] != n['enrolled']:
                # BUGFIX: previous revision indexed n['max'] directly; a short or
                # malformed CSV row (None/'' field) crashed the whole pass.
                crossed, percent = crossed_threshold(o['enrolled'], n['enrolled'], n.get('max', 0))
                if crossed:
                    changes[crn].append((dt, f"Enrollment crossed {percent}% ({n['enrolled']}/{n['max']})."))
            # BUGFIX: tolerate None/'' waitlisted values (was bare int(n['waitlisted'])).
            if _to_int(n.get('waitlisted')) > 10 and o['waitlisted'] != n['waitlisted']:
                changes[crn].append((dt, f"Waitlist exceeds 10: {n['waitlisted']}."))
    return changes


def process_diff_timeline(path):
    """Walk consecutive snapshot pairs of the registration-history CSV.

    Returns (crn_changes, course_names): per-CRN lists of (datetime, message)
    tuples sorted chronologically, and the latest known course title per CRN.
    """
    snapshots = read_grouped_csv(path)
    timeline = sorted(snapshots)
    timeline_diffs = []
    course_names = {}  # crn -> latest known course name
    for i in range(1, len(timeline)):
        prev_ts, curr_ts = timeline[i - 1], timeline[i]
        prev, curr = snapshots[prev_ts], snapshots[curr_ts]
        # Update the course-name map from the newer snapshot.
        for crn, row in curr.items():
            course_names[crn] = row['course']
        timeline_diffs.append(detect_changes(prev, curr))
    # Flatten per-pair deltas into one chronological list per CRN.
    crn_changes = defaultdict(list)
    for delta in timeline_diffs:
        for crn, chg in delta.items():
            crn_changes[crn].extend(chg)
    for crn in crn_changes:
        crn_changes[crn].sort(key=lambda pair: pair[0])
    return crn_changes, course_names
def recreate_reg_data(src="cache/reg_history_fa25.csv", dst="cache/reg_data_fa25.csv"):
    """Rebuild the per-day enrollment pivot table from the raw snapshot log.

    Reads the append-only registration history at *src* (headerless CSV with
    columns datetime, crn, course, teacher, max, enrolled, waitlistmax,
    waitlisted), keeps only the latest snapshot per CRN per calendar day, and
    writes to *dst* a table with one row per CRN and one column per day.

    The paths are now parameters defaulting to the original hard-coded values,
    so existing callers are unchanged while the function becomes reusable.
    """
    import csv
    from collections import defaultdict
    from datetime import datetime

    fieldnames = ['datetime', 'crn', 'course', 'teacher', 'max',
                  'enrolled', 'waitlistmax', 'waitlisted']

    def parse_row(row):
        # Timestamps use dashes in the time part, e.g. 2025-01-31T14-30.
        dt = datetime.strptime(row['datetime'], "%Y-%m-%dT%H-%M")
        return dt, row['crn'], int(row['enrolled'])

    def reduce_latest_per_day(rows):
        # latest[crn][iso_date] = (dt, enrolled); keep newest snapshot per day.
        latest = defaultdict(dict)
        for row in rows:
            dt, crn, enrolled = parse_row(row)
            day = dt.date().isoformat()
            if day not in latest[crn] or dt > latest[crn][day][0]:
                latest[crn][day] = (dt, enrolled)
        return latest

    def pivot_to_table(latest_data):
        # Column set is the union of all observed days, sorted; ISO dates sort
        # chronologically as strings.
        all_dates = sorted({day for per_day in latest_data.values() for day in per_day})
        table = []
        for crn in sorted(latest_data):
            row = [crn]
            for day in all_dates:
                val = latest_data[crn].get(day, (None, None))[1]
                row.append(str(val) if val is not None else "")
            table.append(row)
        return ["crn"] + all_dates, table

    # NOTE(review): assumes the history CSV has no header row (fieldnames are
    # supplied explicitly) — a header row would crash parse_row; confirm.
    with open(src, newline='') as f:
        rows = list(csv.DictReader(f, fieldnames=fieldnames))

    header, table = pivot_to_table(reduce_latest_per_day(rows))

    with open(dst, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(table)
['Set up canvas data 2024 style', setup_canvas_data_2024_run], 4: ['Narrative timeline of section updates', process_reg_history], + 5: ['Recreate reg_data from full reg history', recreate_reg_data], } '''1: ['Re-create schedule csv and json files from raw html',recent_schedules] ,