This commit is contained in:
Peter Howell 2025-08-29 19:47:50 +00:00
parent 8a3924afa4
commit 5ad02cf07e
1 changed files with 135 additions and 5 deletions

140
gpt.py
View File

@ -254,6 +254,21 @@ def process_useful_info():
import re
from pathlib import Path
from typing import Callable, List, Dict, Optional
from typing import Iterable
# Optional heavy libraries: each is imported if installed; otherwise its name is set to None
try:
import PyPDF2
except Exception:
PyPDF2 = None
try:
import docx # python-docx
except Exception:
docx = None
try:
from pptx import Presentation
except Exception:
Presentation = None
# Matches a "### <title>" header line that is preceded and followed by a line
# break (LF or CRLF); group 1 captures the title text after the single
# whitespace character that follows "###".
HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')
# Matches "<subject> — <N> message..." anchored at the start of the string,
# case-insensitively; exposes the named groups "subject" and "count".
COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
@ -320,16 +335,133 @@ def process_useful_info():
if count != -1 and done >= count:
return
def _parse_attachment_paths(block: str) -> List[str]:
paths: List[str] = []
for line in block.splitlines():
if line.startswith("Attachments:"):
# After the colon, comma-separated file paths
rest = line.split(":", 1)[1].strip()
if rest:
parts = [p.strip() for p in rest.split(",") if p.strip()]
paths.extend(parts)
# Deduplicate, keep order
seen = set()
uniq = []
for p in paths:
if p not in seen:
seen.add(p)
uniq.append(p)
return uniq
def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
try:
return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
except Exception:
return ""
def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
    """Extract text from the leading pages of a PDF using PyPDF2.

    Args:
        p: Path of the PDF file.
        max_pages: Upper bound on the number of pages read.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The page texts joined with newlines and cut to ``max_chars``.
        Returns ``""`` when PyPDF2 is unavailable or the file cannot be
        opened/parsed; a page whose extraction fails is skipped.
    """
    if not PyPDF2:
        return ""

    page_texts = []
    try:
        with p.open('rb') as stream:
            reader = PyPDF2.PdfReader(stream)
            page_limit = min(len(reader.pages), max_pages)
            for page_no in range(page_limit):
                try:
                    extracted = reader.pages[page_no].extract_text()
                except Exception:
                    # Best effort: one unreadable page must not lose the rest.
                    continue
                page_texts.append(extracted or "")
    except Exception:
        return ""

    combined = "\n".join(page_texts)
    return combined[:max_chars]
def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
    """Extract paragraph text from a .docx file using python-docx.

    Args:
        p: Path of the document.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The non-empty paragraphs joined with newlines and cut to
        ``max_chars``; ``""`` when python-docx is unavailable or the
        document cannot be read.
    """
    if not docx:
        return ""
    try:
        document = docx.Document(str(p))
        non_empty = filter(None, (para.text for para in document.paragraphs))
        return "\n".join(non_empty)[:max_chars]
    except Exception:
        return ""
def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
    """Extract text from the text frames of a .pptx file using python-pptx.

    Args:
        p: Path of the presentation.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        One line per non-empty paragraph (its runs concatenated), joined
        with newlines and cut to ``max_chars``; ``""`` when python-pptx is
        unavailable or the presentation cannot be read.  A shape that raises
        while being inspected is skipped.
    """
    if not Presentation:
        return ""
    try:
        deck = Presentation(str(p))
        lines: List[str] = []
        for slide in deck.slides:
            for shape in slide.shapes:
                try:
                    # Shapes without a text frame carry no text to collect.
                    if not getattr(shape, "has_text_frame", False):
                        continue
                    for para in shape.text_frame.paragraphs:
                        line = "".join(run.text for run in para.runs)
                        if line:
                            lines.append(line)
                except Exception:
                    # Best effort: ignore a problematic shape, keep going.
                    continue
        return "\n".join(lines)[:max_chars]
    except Exception:
        return ""
def _extract_attachment_text(paths: Iterable[str]) -> str:
out_chunks: List[str] = []
for raw in paths:
p = Path(raw)
# Ensure relative paths still resolve from repo root
if not p.is_absolute():
p = Path.cwd() / p
if not p.exists():
continue
ext = p.suffix.lower()
text = ""
if ext == ".pdf":
text = _extract_pdf_text(p)
elif ext == ".docx":
text = _extract_docx_text(p)
elif ext == ".pptx":
text = _extract_pptx_text(p)
# Fallback: try as utf-8 text
if not text and ext in {".txt", ".md", ".csv"}:
text = _safe_read_textfile(p)
if text:
out_chunks.append(f"--- Attachment: {p.name} ---\n{text}")
return "\n\n".join(out_chunks)
# JSONL file that receives one summary record per processed thread group.
OUT_JSONL = Path("cache/useful_info_summaries.jsonl")


def demo_f(idx, g):
    """Summarize one thread group and append the result to ``OUT_JSONL``.

    The group's content is extended with the text of any attachments listed
    in it, passed to ``summarize_u_info`` exactly once, and the outcome is
    written as a single JSON line.

    Args:
        idx: Index of the group, echoed in the log line and the record.
        g: Mapping with the keys ``subject``, ``count`` and ``content``
            (``content`` is the whole thread block).
    """
    print(f"[{idx}] {g['subject']} (count: {g['count']})")

    content = g['content']
    attach_paths = _parse_attachment_paths(content)
    if attach_paths:
        attach_text = _extract_attachment_text(attach_paths)
        if attach_text:
            content = content + "\n\n[ATTACHMENT_TEXT]\n" + attach_text

    # Summarize only the final, attachment-enriched content.  A second call
    # on the bare content would be discarded and only double the work.
    x = summarize_u_info(content)

    record = {
        "index": idx,
        "subject": g.get('subject'),
        "count": g.get('count'),
        "attachments": attach_paths,
    }
    # Robust to non-JSON responses: keep the raw value when parsing fails.
    try:
        record["summary"] = json.loads(x)
    except Exception:
        record["summary_raw"] = x

    # Make sure the output directory exists before appending.
    OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_JSONL, "a", encoding="utf-8") as outf:
        outf.write(json.dumps(record, ensure_ascii=False) + "\n")
# Run demo_f over the thread groups of the sorted useful-info log.  The
# keyword ``count`` may be passed only once (a repeated keyword argument is a
# SyntaxError); the newer value of 100 is kept.
for_each_group(
    log_path="cache/email_usefulinfo_sorted.txt",
    f=demo_f,
    start=1,  # change to resume at Nth group
    count=100,
)
@ -493,5 +625,3 @@ if __name__ == "__main__":
# Call the function in the options dict
options[ int(resp)][1]()