Peter Howell 2025-08-29 20:19:44 +00:00
parent 5ad02cf07e
commit 20366246db
1 changed file with 154 additions and 21 deletions

gpt.py

@@ -109,7 +109,7 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'):
prefix_re = re.compile(r'^\s*(re|fw|fwd|aw|sv|vb|tr|wg)\s*:\s*', re.I) # common locales too
bracket_tag_re = re.compile(r'^\s*(\[[^\]]+\]\s*)+', re.I)
def normalize_subject(s: str) -> str:
def normalize_subject(s):
if not s:
return "(no subject)"
s = s.strip()
@@ -122,7 +122,7 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'):
s = re.sub(r'\s+', ' ', s).strip()
return s or "(no subject)"
def safe_name(s: str) -> str:
def safe_name(s):
# reasonable Windows-safe folder name for a subject
s = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', s)
return s[:120]
@@ -253,8 +253,6 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'):
def process_useful_info():
import re
from pathlib import Path
from typing import Callable, List, Dict, Optional
from typing import Iterable
# Optional: import heavy libs only when needed
try:
@@ -273,7 +271,7 @@ def process_useful_info():
HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n') # your pattern, CRLF-safe
COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
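# e.g. an illustrative header "Budget review — 3 messages" would yield subject="Budget review", count=3;
# headers without that suffix keep the whole header as the subject and count stays None.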
def parse_groups(text: str) -> List[Dict]:
def parse_groups(text):
"""
Return a list of groups found in the log file.
Each group is a dict: {header, subject, count, content}
@@ -291,7 +289,7 @@ def process_useful_info():
# Try to extract subject and count if present
subject = header
count: Optional[int] = None
count = None
cm = COUNT_RE.search(header)
if cm:
subject = cm.group('subject').strip()
@@ -308,12 +306,9 @@ def process_useful_info():
})
return groups
def for_each_group(
log_path: str = "cache/email_usefulinfo_sorted.txt",
f: Callable[[int, Dict], None] = lambda idx, g: None,
start: int = 1,
count: int = -1
) -> None:
from usefulinfo_db import init_schema, insert_summary_and_events
def for_each_group(log_path="cache/email_usefulinfo_sorted.txt", f=lambda idx, g: None, start=1, count=-1):
"""
Read the grouped log, split into groups, and call f(index, group) on each.
start: 1-based index to begin processing (useful for resuming).
@@ -335,8 +330,8 @@ def process_useful_info():
if count != -1 and done >= count:
return
def _parse_attachment_paths(block: str) -> List[str]:
paths: List[str] = []
def _parse_attachment_paths(block):
paths = []
for line in block.splitlines():
if line.startswith("Attachments:"):
# After the colon, comma-separated file paths
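# e.g. "Attachments: cache/attachments/report.pdf, cache/attachments/agenda.docx" (illustrative paths)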
@@ -353,13 +348,13 @@ def process_useful_info():
uniq.append(p)
return uniq
def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
def _safe_read_textfile(p, max_chars=8000):
try:
return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
except Exception:
return ""
def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
def _extract_pdf_text(p, max_pages=10, max_chars=12000):
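# Best-effort PDF text: returns "" if PyPDF2 is unavailable or the file cannot be read; output is capped at max_chars.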
if not PyPDF2:
return ""
text = []
@@ -376,7 +371,7 @@ def process_useful_info():
return ""
return "\n".join(text)[:max_chars]
def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
def _extract_docx_text(p, max_chars=12000):
if not docx:
return ""
try:
@@ -386,12 +381,12 @@ def process_useful_info():
except Exception:
return ""
def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
def _extract_pptx_text(p, max_chars=12000):
if not Presentation:
return ""
try:
pres = Presentation(str(p))
chunks: List[str] = []
chunks = []
for slide in pres.slides:
for shape in slide.shapes:
try:
@@ -406,8 +401,8 @@ def process_useful_info():
except Exception:
return ""
def _extract_attachment_text(paths: Iterable[str]) -> str:
out_chunks: List[str] = []
def _extract_attachment_text(paths):
out_chunks = []
for raw in paths:
p = Path(raw)
# Ensure relative paths still resolve from repo root
@@ -432,6 +427,134 @@ def process_useful_info():
OUT_JSONL = Path("cache/useful_info_summaries.jsonl")
# --- PostgreSQL schema + insert helpers (via localcache2.db) ---
def _pg_init_schema():
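# Create the summary/event tables if they do not already exist; failures only print a warning
# so processing can continue even without a reachable database.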
try:
from localcache2 import db as _db
CON, CURSOR = _db()
try:
CURSOR.execute(
"""
CREATE TABLE IF NOT EXISTS useful_info_summary (
id BIGSERIAL PRIMARY KEY,
summary_hash CHAR(64) UNIQUE,
grp_index INTEGER,
subject TEXT,
thread_count INTEGER,
source TEXT,
date_label TEXT,
tags_json JSONB,
short_text TEXT,
summary_text TEXT,
attachments_json JSONB,
raw_json TEXT,
created_at TIMESTAMPTZ DEFAULT now()
);
"""
)
CURSOR.execute(
"""
CREATE TABLE IF NOT EXISTS useful_info_event (
id BIGSERIAL PRIMARY KEY,
summary_id BIGINT NOT NULL REFERENCES useful_info_summary(id) ON DELETE CASCADE,
dt TEXT,
length TEXT,
title TEXT,
description TEXT,
created_at TIMESTAMPTZ DEFAULT now()
);
"""
)
CON.commit()
finally:
CURSOR.close()
CON.close()
except Exception as e:
print("[warn] could not init PostgreSQL schema:", e)
def _sha256(s):
import hashlib
return hashlib.sha256(s.encode('utf-8', 'ignore')).hexdigest()
def _pg_insert_summary_and_events(idx, subject, count, attachments, parsed, raw):
try:
from localcache2 import db as _db
import json as _json
CON, CURSOR = _db()
try:
source = None
date_label = None
tags = None
short_text = ''
summary_text = ''
events = []
if parsed:
source = parsed.get('source')
date_label = parsed.get('date')
tags = parsed.get('tags')
short_text = parsed.get('short') or ''
summary_text = parsed.get('summary') or ''
events = parsed.get('events') or []
s_hash = _sha256((subject or '') + "\n" + short_text + "\n" + summary_text)
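# summary_hash is UNIQUE, so re-processing the same group upserts (only grp_index is refreshed) instead of inserting duplicates.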
CURSOR.execute(
"""
INSERT INTO useful_info_summary
(summary_hash, grp_index, subject, thread_count, source, date_label,
tags_json, short_text, summary_text, attachments_json, raw_json)
VALUES
(%s, %s, %s, %s, %s, %s,
CAST(%s AS JSONB), %s, %s, CAST(%s AS JSONB), %s)
ON CONFLICT (summary_hash)
DO UPDATE SET grp_index = EXCLUDED.grp_index
RETURNING id
""",
(
s_hash,
idx,
subject,
count,
source,
date_label,
_json.dumps(tags) if tags is not None else None,
short_text,
summary_text,
_json.dumps(attachments) if attachments else None,
raw,
),
)
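# RETURNING id yields the row's primary key on both the insert and the conflict-update path, so events can always reference it.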
row = CURSOR.fetchone()
summary_id = row[0] if row else None
if summary_id and isinstance(events, list):
for e in events:
try:
CURSOR.execute(
"""
INSERT INTO useful_info_event
(summary_id, dt, length, title, description)
VALUES (%s, %s, %s, %s, %s)
""",
(
summary_id,
(e or {}).get('dt'),
(e or {}).get('length'),
(e or {}).get('title'),
(e or {}).get('description'),
),
)
except Exception:
pass
CON.commit()
finally:
CURSOR.close()
CON.close()
except Exception as e:
print("[warn] PostgreSQL insert failed:", e)
# Ensure DB schema exists
_pg_init_schema()
def demo_f(idx, g):
print(f"[{idx}] {g['subject']} (count: {g['count']})")
content = g['content']
@@ -457,6 +580,16 @@ def process_useful_info():
with open(OUT_JSONL, "a", encoding="utf-8") as outf:
outf.write(json.dumps(record, ensure_ascii=False) + "\n")
# Also persist to PostgreSQL using localcache2
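# Records with a parsed 'summary' dict store structured fields; otherwise only the raw summary text is kept in raw_json.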
if 'summary' in record:
_pg_insert_summary_and_events(
idx, record.get('subject'), record.get('count'), attach_paths, record['summary'], None
)
else:
_pg_insert_summary_and_events(
idx, record.get('subject'), record.get('count'), attach_paths, None, record.get('summary_raw')
)
for_each_group(
log_path="cache/email_usefulinfo_sorted.txt",
f=demo_f,