From 2e20c0c528a1b030add86fc36cb59f7460a7664f Mon Sep 17 00:00:00 2001 From: Peter Howell Date: Thu, 4 Sep 2025 19:31:31 +0000 Subject: [PATCH] gpt --- gpt.py | 50 ++++++++++++++++++++- localcache2.py | 115 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 155 insertions(+), 10 deletions(-) diff --git a/gpt.py b/gpt.py index 7a4cc0b..c40a16e 100644 --- a/gpt.py +++ b/gpt.py @@ -209,7 +209,31 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'): # Collect first to a list so we can sort/group records = [] items = uinfo.Items - # Optional: sort by SentOn ascending inside Outlook (helps performance for big folders) + + # Incremental restrict by last seen sent time (with 2-day backoff), and schema upgrade for entry_id + try: + from localcache2 import db as _db, upgrade_usefulinfo_schema + upgrade_usefulinfo_schema() + CON, CUR = _db() + last_iso = None + try: + CUR.execute("SELECT MAX(sent_iso) FROM useful_info_email") + row = CUR.fetchone() + if row and row[0]: + last_iso = row[0] + finally: + CUR.close(); CON.close() + if last_iso: + from dateutil import parser as _p + from datetime import timedelta + dt = _p.parse(str(last_iso)) - timedelta(days=2) + start_str = dt.strftime("%m/%d/%Y %I:%M %p") + items = items.Restrict(f"[ReceivedTime] >= '{start_str}'") + except Exception as ex_restrict: + # If anything fails, fall back to full set + pass + + # Sort by SentOn ascending inside Outlook (helps performance for big folders) try: items.Sort("[SentOn]", True) # True => ascending except Exception: @@ -235,6 +259,11 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'): sent_on = getattr(message, "SentOn", None) sent_iso = iso(sent_on) + entry_id = None + try: + entry_id = getattr(message, 'EntryID', None) + except Exception: + entry_id = None attachments_saved = [] if save_attachments: @@ -252,12 +281,29 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'): except Exception: body = "" + # Decode Outlook SafeLinks inside body text (basic replacement) + try: + import urllib.parse as _up + def _decode_match(m): + u = m.group(0) + try: + q = _up.urlparse(u).query + params = _up.parse_qs(q) + real = params.get('url', [''])[0] or params.get('target', [''])[0] + return _up.unquote(real) if real else u + except Exception: + return u + body = re.sub(r"https?://[\w\.-]*safelinks\.protection\.outlook\.com/[^\s\)\]\>]+", _decode_match, body) + except Exception: + pass + records.append({ "subject_norm": subj_norm, "subject_raw": subj_raw, "sender": sender, "sent_on": sent_on, "sent_iso": sent_iso, + "entry_id": entry_id, "attachments": attachments_saved, "body": body, }) @@ -365,6 +411,7 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'): body=rec['body'], attachments=atts, summary_id=None, + entry_id=rec.get('entry_id') ) except Exception as e: print('[usefulinfo][email-insert-failed]', rec.get('subject_raw'), str(e)) @@ -784,6 +831,7 @@ if __name__ == "__main__": 5: ['list faq mailbox', list_faq], 6: ['process useful info msgs', process_useful_info], 7: ['export useful info events to .ics', lambda: (__import__('localcache2').localcache2.export_usefulinfo_events_to_ics() or True)], + 8: ['fix safelinks in DB', lambda: (__import__('localcache2').localcache2.fix_safelinks_in_db() or True)], } diff --git a/localcache2.py b/localcache2.py index 56d776b..ff4a3a0 100644 --- a/localcache2.py +++ b/localcache2.py @@ -640,6 +640,7 @@ def init_usefulinfo_schema(): CREATE TABLE IF NOT EXISTS useful_info_email ( id BIGSERIAL PRIMARY KEY, summary_id BIGINT NULL REFERENCES useful_info_summary(id) ON DELETE SET NULL, + entry_id TEXT UNIQUE, subject_raw TEXT, subject_norm TEXT, sender TEXT, @@ -922,7 +923,74 @@ def insert_usefulinfo_record(parsed): return summary_id -def insert_usefulinfo_email(subject_raw, subject_norm, sender, sent_iso, body, attachments=None, summary_id=None): +def _decode_safelinks_text(text): + """Replace Outlook SafeLinks with their original URL inside given text. + Returns modified text (or original if no changes). + """ + if not text: + return text + try: + import urllib.parse as _up + import re as _re + def _decode_match(m): + u = m.group(0) + try: + q = _up.urlparse(u).query + params = _up.parse_qs(q) + real = params.get('url', [''])[0] or params.get('target', [''])[0] + return _up.unquote(real) if real else u + except Exception: + return u + return _re.sub(r"https?://[\w\.-]*safelinks\.protection\.outlook\.com/[^\s\)\]\>]+", _decode_match, text) + except Exception: + return text + + +def fix_safelinks_in_db(batch_size=500): + """Decode SafeLinks URLs in existing DB rows for: + - useful_info_email.body + - useful_info_summary.summary_text + - useful_info_attachment.text + Returns a dict of counts updated. + """ + CON, CUR = db() + updated = {'email_body': 0, 'summary_text': 0, 'attachment_text': 0} + try: + # Emails + CUR.execute("SELECT id, body FROM useful_info_email") + rows = CUR.fetchall() + for rid, body in rows: + new = _decode_safelinks_text(body) + if new != body: + CUR.execute("UPDATE useful_info_email SET body=%s WHERE id=%s", (new, rid)) + updated['email_body'] += 1 + CON.commit() + + # Summaries + CUR.execute("SELECT id, summary_text FROM useful_info_summary") + rows = CUR.fetchall() + for rid, st in rows: + new = _decode_safelinks_text(st) + if new != st: + CUR.execute("UPDATE useful_info_summary SET summary_text=%s WHERE id=%s", (new, rid)) + updated['summary_text'] += 1 + CON.commit() + + # Attachments text + CUR.execute("SELECT id, text FROM useful_info_attachment WHERE text IS NOT NULL") + rows = CUR.fetchall() + for rid, tx in rows: + new = _decode_safelinks_text(tx) + if new != tx: + CUR.execute("UPDATE useful_info_attachment SET text=%s WHERE id=%s", (new, rid)) + updated['attachment_text'] += 1 + CON.commit() + finally: + CUR.close(); CON.close() + print('[usefulinfo][fix_safelinks] updated:', updated) + return updated + +def insert_usefulinfo_email(subject_raw, subject_norm, sender, sent_iso, body, attachments=None, summary_id=None, entry_id=None): """Insert an original email and any attachments. attachments: list of dicts like {'path': str, 'text': str or None} summary_id: optional FK to useful_info_summary; can be None and linked later. @@ -932,14 +1000,26 @@ def insert_usefulinfo_email(subject_raw, subject_norm, sender, sent_iso, body, a CON, CUR = db() email_id = None try: - CUR.execute( - """ - INSERT INTO useful_info_email (summary_id, subject_raw, subject_norm, sender, sent_iso, body) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id - """, - (summary_id, subject_raw, subject_norm, sender, sent_iso, body) - ) + try: + CUR.execute( + """ + INSERT INTO useful_info_email (summary_id, entry_id, subject_raw, subject_norm, sender, sent_iso, body) + VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (entry_id) DO NOTHING + RETURNING id + """, + (summary_id, entry_id, subject_raw, subject_norm, sender, sent_iso, body) + ) + except Exception: + # Fallback if entry_id column not present + CUR.execute( + """ + INSERT INTO useful_info_email (summary_id, subject_raw, subject_norm, sender, sent_iso, body) + VALUES (%s, %s, %s, %s, %s, %s) + RETURNING id + """, + (summary_id, subject_raw, subject_norm, sender, sent_iso, body) + ) row = CUR.fetchone() email_id = row[0] if row else None @@ -980,6 +1060,23 @@ def insert_usefulinfo_email(subject_raw, subject_norm, sender, sent_iso, body, a return email_id +def upgrade_usefulinfo_schema(): + """Ensure incremental-friendly schema (entry_id unique) is present.""" + CON, CUR = db() + try: + try: + CUR.execute("ALTER TABLE useful_info_email ADD COLUMN IF NOT EXISTS entry_id TEXT") + except Exception: + pass + try: + CUR.execute("CREATE UNIQUE INDEX IF NOT EXISTS useful_info_email_entry_id_idx ON useful_info_email(entry_id)") + except Exception: + pass + CON.commit() + finally: + CUR.close(); CON.close() + + def link_emails_to_summary(subject_norm, summary_id): """Link any emails with the given normalized subject to the provided summary. Returns number of rows updated.