gpt

commit 5ad02cf07e (parent 8a3924afa4)

gpt.py | 140 lines changed
@@ -254,6 +254,21 @@ def process_useful_info():
import re
import json  # used by the JSONL persistence below
from pathlib import Path
from typing import Callable, List, Dict, Optional
from typing import Iterable

# Import the heavy optional libs lazily; fall back to None when unavailable
try:
    import PyPDF2
except Exception:
    PyPDF2 = None
try:
    import docx  # python-docx
except Exception:
    docx = None
try:
    from pptx import Presentation
except Exception:
    Presentation = None

HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')  # your pattern, CRLF-safe
COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
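
A quick sanity check of the two patterns above, on a made-up header line (the subject and count are hypothetical; the real input is cache/email_usefulinfo_sorted.txt):

    import re

    HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')
    COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)

    sample = "\n### Weekly sync — 3 messages\nFirst body line\n"
    header = HEADER_RE.search(sample).group(1)   # "Weekly sync — 3 messages"
    m = COUNT_RE.match(header)
    print(m.group("subject"), m.group("count"))  # -> Weekly sync 3
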
@@ -320,16 +335,133 @@ def process_useful_info():
        if count != -1 and done >= count:
            return

def _parse_attachment_paths(block: str) -> List[str]:
    paths: List[str] = []
    for line in block.splitlines():
        if line.startswith("Attachments:"):
            # After the colon: comma-separated file paths
            rest = line.split(":", 1)[1].strip()
            if rest:
                parts = [p.strip() for p in rest.split(",") if p.strip()]
                paths.extend(parts)
    # Deduplicate, keep order
    seen = set()
    uniq = []
    for p in paths:
        if p not in seen:
            seen.add(p)
            uniq.append(p)
    return uniq
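
For reference, how _parse_attachment_paths behaves on a hypothetical block (the paths are invented): repeated paths collapse and first-seen order is preserved.

    block = (
        "Attachments: reports/q3.pdf, notes.txt\n"
        "Some body text\n"
        "Attachments: reports/q3.pdf\n"
    )
    print(_parse_attachment_paths(block))
    # -> ['reports/q3.pdf', 'notes.txt']
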
def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
    try:
        return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
    except Exception:
        return ""

def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
    if not PyPDF2:
        return ""
    text = []
    try:
        with p.open('rb') as fh:
            reader = PyPDF2.PdfReader(fh)
            pages = min(len(reader.pages), max_pages)
            for i in range(pages):
                try:
                    text.append(reader.pages[i].extract_text() or "")
                except Exception:
                    pass  # skip pages that fail to extract
    except Exception:
        return ""
    return "\n".join(text)[:max_chars]
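
A caveat on the import above: PyPDF2 is in maintenance mode, and its maintained successor pypdf keeps the same PdfReader / pages / extract_text() API, so a drop-in fallback is possible. A minimal sketch, assuming at most one of the two packages is installed:

    try:
        from pypdf import PdfReader       # maintained successor
    except Exception:
        try:
            from PyPDF2 import PdfReader  # legacy package used above
        except Exception:
            PdfReader = None
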
def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
    if not docx:
        return ""
    try:
        d = docx.Document(str(p))
        paras = [para.text for para in d.paragraphs if para.text]
        return "\n".join(paras)[:max_chars]
    except Exception:
        return ""

def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
    if not Presentation:
        return ""
    try:
        pres = Presentation(str(p))
        chunks: List[str] = []
        for slide in pres.slides:
            for shape in slide.shapes:
                try:
                    if hasattr(shape, "has_text_frame") and shape.has_text_frame:
                        for para in shape.text_frame.paragraphs:
                            text = "".join(run.text for run in para.runs)
                            if text:
                                chunks.append(text)
                except Exception:
                    pass  # skip shapes that fail to read
        return "\n".join(chunks)[:max_chars]
    except Exception:
        return ""

def _extract_attachment_text(paths: Iterable[str]) -> str:
    out_chunks: List[str] = []
    for raw in paths:
        p = Path(raw)
        # Ensure relative paths still resolve from the repo root
        if not p.is_absolute():
            p = Path.cwd() / p
        if not p.exists():
            continue
        ext = p.suffix.lower()
        text = ""
        if ext == ".pdf":
            text = _extract_pdf_text(p)
        elif ext == ".docx":
            text = _extract_docx_text(p)
        elif ext == ".pptx":
            text = _extract_pptx_text(p)
        # Fallback: read plain-text formats as UTF-8
        if not text and ext in {".txt", ".md", ".csv"}:
            text = _safe_read_textfile(p)
        if text:
            out_chunks.append(f"--- Attachment: {p.name} ---\n{text}")
    return "\n\n".join(out_chunks)
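
Nonexistent paths are skipped silently, so a call like the following (file names are hypothetical) is safe even when nothing resolves; each recovered chunk is prefixed with "--- Attachment: <name> ---" so the summarizer can attribute text to its source file.

    combined = _extract_attachment_text(["reports/q3.pdf", "missing.docx", "notes.txt"])
    print(combined or "(no attachment text recovered)")
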
OUT_JSONL = Path("cache/useful_info_summaries.jsonl")

def demo_f(idx, g):
    print(f"[{idx}] {g['subject']} (count: {g['count']})")
    # Do your processing here. g['content'] is the whole thread block.
    content = g['content']
    attach_paths = _parse_attachment_paths(content)
    if attach_paths:
        attach_text = _extract_attachment_text(attach_paths)
        if attach_text:
            content = content + "\n\n[ATTACHMENT_TEXT]\n" + attach_text
    x = summarize_u_info(content)

    # Persist a JSONL record (robust to non-JSON responses)
    record = {
        "index": idx,
        "subject": g.get('subject'),
        "count": g.get('count'),
        "attachments": attach_paths,
    }
    try:
        parsed = json.loads(x)
        record["summary"] = parsed
    except Exception:
        record["summary_raw"] = x
    with open(OUT_JSONL, "a", encoding="utf-8") as outf:
        outf.write(json.dumps(record, ensure_ascii=False) + "\n")

for_each_group(
    log_path="cache/email_usefulinfo_sorted.txt",
    f=demo_f,
    start=1,    # change to resume at Nth group
    count=100,  # was 10 before this commit
)
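
Since OUT_JSONL is opened in append mode, re-runs accumulate records. A minimal sketch for reading the file back to pick a resume point, assuming the record layout written by demo_f above:

    import json
    from pathlib import Path

    done = set()
    out = Path("cache/useful_info_summaries.jsonl")
    if out.exists():
        with out.open(encoding="utf-8") as fh:
            for line in fh:
                done.add(json.loads(line)["index"])
    print(f"{len(done)} groups done; resume with start={max(done, default=0) + 1}")
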
@@ -493,5 +625,3 @@ if __name__ == "__main__":
    # Call the function in the options dict
    options[int(resp)][1]()