From 5ad02cf07e2de250973230769459e25e0817273f Mon Sep 17 00:00:00 2001
From: Peter Howell
Date: Fri, 29 Aug 2025 19:47:50 +0000
Subject: [PATCH] gpt: extract attachment text and persist summaries as JSONL
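
Fold attachment text into each thread block before summarizing, and
append every summary to a JSONL file:

- Parse "Attachments:" lines (comma-separated paths) from each thread.
- Extract text from .pdf (PyPDF2), .docx (python-docx), and .pptx
  (python-pptx) attachments, falling back to a plain UTF-8 read for
  .txt/.md/.csv. All three parsers are optional imports, so the script
  still runs (without extraction) when they are missing.
- Append the extracted text under an [ATTACHMENT_TEXT] marker before
  calling summarize_u_info().
- Write one record per group to cache/useful_info_summaries.jsonl,
  keeping the raw model output in "summary_raw" when it is not valid
  JSON.
- Bump the demo run from 10 to 100 groups.

A thread is expected to carry a line like (paths illustrative):

    Attachments: reports/q3.pdf, notes/minutes.docx

and each output record comes out shaped like:

    {"index": 1, "subject": "...", "count": 3,
     "attachments": ["reports/q3.pdf"], "summary": {...}}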
Path("cache/useful_info_summaries.jsonl") + def demo_f(idx, g): print(f"[{idx}] {g['subject']} (count: {g['count']})") - x = summarize_u_info(g['content']) - # Do your processing here. g['content'] is the whole thread block. + content = g['content'] + attach_paths = _parse_attachment_paths(content) + if attach_paths: + attach_text = _extract_attachment_text(attach_paths) + if attach_text: + content = content + "\n\n[ATTACHMENT_TEXT]\n" + attach_text + x = summarize_u_info(content) + + # Persist JSONL record (robust to non-JSON responses) + record = { + "index": idx, + "subject": g.get('subject'), + "count": g.get('count'), + "attachments": attach_paths, + } + try: + parsed = json.loads(x) + record["summary"] = parsed + except Exception: + record["summary_raw"] = x + with open(OUT_JSONL, "a", encoding="utf-8") as outf: + outf.write(json.dumps(record, ensure_ascii=False) + "\n") for_each_group( log_path="cache/email_usefulinfo_sorted.txt", f=demo_f, start=1, # change to resume at Nth group - count=10 + count=100 ) @@ -493,5 +625,3 @@ if __name__ == "__main__": # Call the function in the options dict options[ int(resp)][1]() - -