gpt

commit 5ad02cf07e (parent 8a3924afa4)

gpt.py | 140 lines changed
@@ -254,6 +254,21 @@ def process_useful_info():
import re
import json  # used by the JSONL persistence below
from pathlib import Path
from typing import Callable, List, Dict, Optional
from typing import Iterable

# Import the heavy optional libs lazily; fall back to None when unavailable
try:
    import PyPDF2
except Exception:
    PyPDF2 = None
try:
    import docx  # python-docx
except Exception:
    docx = None
try:
    from pptx import Presentation
except Exception:
    Presentation = None

HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')  # your pattern, CRLF-safe
COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
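
A quick sanity check of the two patterns above, on a made-up header line (the subject and count are hypothetical; the real input is cache/email_usefulinfo_sorted.txt):

    import re

    HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')
    COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)

    sample = "\n### Weekly sync — 3 messages\nFirst body line\n"
    header = HEADER_RE.search(sample).group(1)   # "Weekly sync — 3 messages"
    m = COUNT_RE.match(header)
    print(m.group("subject"), m.group("count"))  # -> Weekly sync 3
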
@@ -320,16 +335,133 @@ def process_useful_info():
        if count != -1 and done >= count:
            return

def _parse_attachment_paths(block: str) -> List[str]:
    paths: List[str] = []
    for line in block.splitlines():
        if line.startswith("Attachments:"):
            # After the colon: comma-separated file paths
            rest = line.split(":", 1)[1].strip()
            if rest:
                parts = [p.strip() for p in rest.split(",") if p.strip()]
                paths.extend(parts)
    # Deduplicate, keep order
    seen = set()
    uniq = []
    for p in paths:
        if p not in seen:
            seen.add(p)
            uniq.append(p)
    return uniq
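
For reference, how _parse_attachment_paths behaves on a hypothetical block (the paths are invented): repeated paths collapse and first-seen order is preserved.

    block = (
        "Attachments: reports/q3.pdf, notes.txt\n"
        "Some body text\n"
        "Attachments: reports/q3.pdf\n"
    )
    print(_parse_attachment_paths(block))
    # -> ['reports/q3.pdf', 'notes.txt']
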
def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
    try:
        return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
    except Exception:
        return ""

def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
    if not PyPDF2:
        return ""
    text = []
    try:
        with p.open('rb') as fh:
            reader = PyPDF2.PdfReader(fh)
            pages = min(len(reader.pages), max_pages)
            for i in range(pages):
                try:
                    text.append(reader.pages[i].extract_text() or "")
                except Exception:
                    pass  # skip pages that fail to extract
    except Exception:
        return ""
    return "\n".join(text)[:max_chars]
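
A caveat on the import above: PyPDF2 is in maintenance mode, and its maintained successor pypdf keeps the same PdfReader / pages / extract_text() API, so a drop-in fallback is possible. A minimal sketch, assuming at most one of the two packages is installed:

    try:
        from pypdf import PdfReader       # maintained successor
    except Exception:
        try:
            from PyPDF2 import PdfReader  # legacy package used above
        except Exception:
            PdfReader = None
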
def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
    if not docx:
        return ""
    try:
        d = docx.Document(str(p))
        paras = [para.text for para in d.paragraphs if para.text]
        return "\n".join(paras)[:max_chars]
    except Exception:
        return ""

def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
    if not Presentation:
        return ""
    try:
        pres = Presentation(str(p))
        chunks: List[str] = []
        for slide in pres.slides:
            for shape in slide.shapes:
                try:
                    if hasattr(shape, "has_text_frame") and shape.has_text_frame:
                        for para in shape.text_frame.paragraphs:
                            text = "".join(run.text for run in para.runs)
                            if text:
                                chunks.append(text)
                except Exception:
                    pass  # skip shapes that fail to read
        return "\n".join(chunks)[:max_chars]
    except Exception:
        return ""

def _extract_attachment_text(paths: Iterable[str]) -> str:
    out_chunks: List[str] = []
    for raw in paths:
        p = Path(raw)
        # Ensure relative paths still resolve from the repo root
        if not p.is_absolute():
            p = Path.cwd() / p
        if not p.exists():
            continue
        ext = p.suffix.lower()
        text = ""
        if ext == ".pdf":
            text = _extract_pdf_text(p)
        elif ext == ".docx":
            text = _extract_docx_text(p)
        elif ext == ".pptx":
            text = _extract_pptx_text(p)
        # Fallback: read plain-text formats as UTF-8
        if not text and ext in {".txt", ".md", ".csv"}:
            text = _safe_read_textfile(p)
        if text:
            out_chunks.append(f"--- Attachment: {p.name} ---\n{text}")
    return "\n\n".join(out_chunks)
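
Nonexistent paths are skipped silently, so a call like the following (file names are hypothetical) is safe even when nothing resolves; each recovered chunk is prefixed with "--- Attachment: <name> ---" so the summarizer can attribute text to its source file.

    combined = _extract_attachment_text(["reports/q3.pdf", "missing.docx", "notes.txt"])
    print(combined or "(no attachment text recovered)")
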
OUT_JSONL = Path("cache/useful_info_summaries.jsonl")

def demo_f(idx, g):
    print(f"[{idx}] {g['subject']} (count: {g['count']})")
    # Do your processing here. g['content'] is the whole thread block.
    content = g['content']
    attach_paths = _parse_attachment_paths(content)
    if attach_paths:
        attach_text = _extract_attachment_text(attach_paths)
        if attach_text:
            content = content + "\n\n[ATTACHMENT_TEXT]\n" + attach_text
    x = summarize_u_info(content)

    # Persist a JSONL record (robust to non-JSON responses)
    record = {
        "index": idx,
        "subject": g.get('subject'),
        "count": g.get('count'),
        "attachments": attach_paths,
    }
    try:
        parsed = json.loads(x)
        record["summary"] = parsed
    except Exception:
        record["summary_raw"] = x
    with open(OUT_JSONL, "a", encoding="utf-8") as outf:
        outf.write(json.dumps(record, ensure_ascii=False) + "\n")

for_each_group(
    log_path="cache/email_usefulinfo_sorted.txt",
    f=demo_f,
    start=1,    # change to resume at Nth group
    count=100,  # was 10 before this commit
)
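
Since OUT_JSONL is opened in append mode, re-runs accumulate records. A minimal sketch for reading the file back to pick a resume point, assuming the record layout written by demo_f above:

    import json
    from pathlib import Path

    done = set()
    out = Path("cache/useful_info_summaries.jsonl")
    if out.exists():
        with out.open(encoding="utf-8") as fh:
            for line in fh:
                done.add(json.loads(line)["index"])
    print(f"{len(done)} groups done; resume with start={max(done, default=0) + 1}")
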
@@ -493,5 +625,3 @@ if __name__ == "__main__":
    # Call the function in the options dict
    options[int(resp)][1]()