gpt
commit 5ad02cf07e (parent 8a3924afa4)

gpt.py | 140
@@ -254,6 +254,21 @@ def process_useful_info():
     import re
     from pathlib import Path
     from typing import Callable, List, Dict, Optional
+    from typing import Iterable
+
+    # Optional: import heavy libs only when needed
+    try:
+        import PyPDF2
+    except Exception:
+        PyPDF2 = None
+    try:
+        import docx  # python-docx
+    except Exception:
+        docx = None
+    try:
+        from pptx import Presentation
+    except Exception:
+        Presentation = None
 
     HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')  # your pattern, CRLF-safe
     COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
@@ -320,16 +335,133 @@ def process_useful_info():
             if count != -1 and done >= count:
                 return
 
+    def _parse_attachment_paths(block: str) -> List[str]:
+        paths: List[str] = []
+        for line in block.splitlines():
+            if line.startswith("Attachments:"):
+                # After the colon, comma-separated file paths
+                rest = line.split(":", 1)[1].strip()
+                if rest:
+                    parts = [p.strip() for p in rest.split(",") if p.strip()]
+                    paths.extend(parts)
+        # Deduplicate, keep order
+        seen = set()
+        uniq = []
+        for p in paths:
+            if p not in seen:
+                seen.add(p)
+                uniq.append(p)
+        return uniq
+
+    def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
+        try:
+            return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
+        except Exception:
+            return ""
+
+    def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
+        if not PyPDF2:
+            return ""
+        text = []
+        try:
+            with p.open('rb') as fh:
+                reader = PyPDF2.PdfReader(fh)
+                pages = min(len(reader.pages), max_pages)
+                for i in range(pages):
+                    try:
+                        text.append(reader.pages[i].extract_text() or "")
+                    except Exception:
+                        pass
+        except Exception:
+            return ""
+        return "\n".join(text)[:max_chars]
+
+    def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
+        if not docx:
+            return ""
+        try:
+            d = docx.Document(str(p))
+            paras = [para.text for para in d.paragraphs if para.text]
+            return "\n".join(paras)[:max_chars]
+        except Exception:
+            return ""
+
+    def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
+        if not Presentation:
+            return ""
+        try:
+            pres = Presentation(str(p))
+            chunks: List[str] = []
+            for slide in pres.slides:
+                for shape in slide.shapes:
+                    try:
+                        if hasattr(shape, "has_text_frame") and shape.has_text_frame:
+                            for para in shape.text_frame.paragraphs:
+                                text = "".join(run.text for run in para.runs)
+                                if text:
+                                    chunks.append(text)
+                    except Exception:
+                        pass
+            return "\n".join(chunks)[:max_chars]
+        except Exception:
+            return ""
+
+    def _extract_attachment_text(paths: Iterable[str]) -> str:
+        out_chunks: List[str] = []
+        for raw in paths:
+            p = Path(raw)
+            # Ensure relative paths still resolve from repo root
+            if not p.is_absolute():
+                p = Path.cwd() / p
+            if not p.exists():
+                continue
+            ext = p.suffix.lower()
+            text = ""
+            if ext == ".pdf":
+                text = _extract_pdf_text(p)
+            elif ext == ".docx":
+                text = _extract_docx_text(p)
+            elif ext == ".pptx":
+                text = _extract_pptx_text(p)
+            # Fallback: try as utf-8 text
+            if not text and ext in {".txt", ".md", ".csv"}:
+                text = _safe_read_textfile(p)
+            if text:
+                out_chunks.append(f"--- Attachment: {p.name} ---\n{text}")
+        return "\n\n".join(out_chunks)
+
+    OUT_JSONL = Path("cache/useful_info_summaries.jsonl")
+
     def demo_f(idx, g):
         print(f"[{idx}] {g['subject']} (count: {g['count']})")
-        x = summarize_u_info(g['content'])
-        # Do your processing here. g['content'] is the whole thread block.
+        content = g['content']
+        attach_paths = _parse_attachment_paths(content)
+        if attach_paths:
+            attach_text = _extract_attachment_text(attach_paths)
+            if attach_text:
+                content = content + "\n\n[ATTACHMENT_TEXT]\n" + attach_text
+        x = summarize_u_info(content)
+
+        # Persist JSONL record (robust to non-JSON responses)
+        record = {
+            "index": idx,
+            "subject": g.get('subject'),
+            "count": g.get('count'),
+            "attachments": attach_paths,
+        }
+        try:
+            parsed = json.loads(x)
+            record["summary"] = parsed
+        except Exception:
+            record["summary_raw"] = x
+        with open(OUT_JSONL, "a", encoding="utf-8") as outf:
+            outf.write(json.dumps(record, ensure_ascii=False) + "\n")
     for_each_group(
         log_path="cache/email_usefulinfo_sorted.txt",
         f=demo_f,
         start=1,   # change to resume at Nth group
-        count=10
+        count=100
     )
 
 
 
@@ -493,5 +625,3 @@ if __name__ == "__main__":
 
     # Call the function in the options dict
     options[int(resp)][1]()
-
-
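A minimal usage sketch of the attachment helpers added in this commit, assuming they are available in the current scope; the thread block and file paths below are hypothetical, and any extractor whose optional library (PyPDF2, python-docx, python-pptx) is missing simply returns an empty string:

    # Hypothetical thread block; the "Attachments:" line is what _parse_attachment_paths looks for.
    block = (
        "### Project kickoff\n"
        "Attachments: attachments/agenda.pdf, attachments/minutes.docx, notes.txt\n"
    )
    paths = _parse_attachment_paths(block)    # deduplicated list of path strings
    text = _extract_attachment_text(paths)    # "" for missing or unreadable files
    if text:
        print(text[:500])                     # extractors already cap size via max_chars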
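The records that demo_f appends to cache/useful_info_summaries.jsonl are one JSON object per line, so they can be inspected later with a short reader; this is a sketch, not part of the commit:

    import json
    from pathlib import Path

    for line in Path("cache/useful_info_summaries.jsonl").read_text(encoding="utf-8").splitlines():
        rec = json.loads(line)
        # "summary" holds parsed JSON; otherwise "summary_raw" holds the raw model output.
        print(rec["index"], rec["subject"], "summary" in rec)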