gpt
commit 5ad02cf07e (parent 8a3924afa4)

gpt.py | 140
@@ -254,6 +254,21 @@ def process_useful_info():
     import re
     from pathlib import Path
     from typing import Callable, List, Dict, Optional
+    from typing import Iterable
+
+    # Optional: import heavy libs only when needed
+    try:
+        import PyPDF2
+    except Exception:
+        PyPDF2 = None
+    try:
+        import docx  # python-docx
+    except Exception:
+        docx = None
+    try:
+        from pptx import Presentation
+    except Exception:
+        Presentation = None
 
     HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')  # your pattern, CRLF-safe
     COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
@@ -320,16 +335,133 @@ def process_useful_info():
             if count != -1 and done >= count:
                 return
 
+    def _parse_attachment_paths(block: str) -> List[str]:
+        paths: List[str] = []
+        for line in block.splitlines():
+            if line.startswith("Attachments:"):
+                # After the colon, comma-separated file paths
+                rest = line.split(":", 1)[1].strip()
+                if rest:
+                    parts = [p.strip() for p in rest.split(",") if p.strip()]
+                    paths.extend(parts)
+        # Deduplicate, keep order
+        seen = set()
+        uniq = []
+        for p in paths:
+            if p not in seen:
+                seen.add(p)
+                uniq.append(p)
+        return uniq
+
+    def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
+        try:
+            return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
+        except Exception:
+            return ""
+
+    def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
+        if not PyPDF2:
+            return ""
+        text = []
+        try:
+            with p.open('rb') as fh:
+                reader = PyPDF2.PdfReader(fh)
+                pages = min(len(reader.pages), max_pages)
+                for i in range(pages):
+                    try:
+                        text.append(reader.pages[i].extract_text() or "")
+                    except Exception:
+                        pass
+        except Exception:
+            return ""
+        return "\n".join(text)[:max_chars]
+
+    def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
+        if not docx:
+            return ""
+        try:
+            d = docx.Document(str(p))
+            paras = [para.text for para in d.paragraphs if para.text]
+            return "\n".join(paras)[:max_chars]
+        except Exception:
+            return ""
+
+    def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
+        if not Presentation:
+            return ""
+        try:
+            pres = Presentation(str(p))
+            chunks: List[str] = []
+            for slide in pres.slides:
+                for shape in slide.shapes:
+                    try:
+                        if hasattr(shape, "has_text_frame") and shape.has_text_frame:
+                            for para in shape.text_frame.paragraphs:
+                                text = "".join(run.text for run in para.runs)
+                                if text:
+                                    chunks.append(text)
+                    except Exception:
+                        pass
+            return "\n".join(chunks)[:max_chars]
+        except Exception:
+            return ""
+
+    def _extract_attachment_text(paths: Iterable[str]) -> str:
+        out_chunks: List[str] = []
+        for raw in paths:
+            p = Path(raw)
+            # Ensure relative paths still resolve from repo root
+            if not p.is_absolute():
+                p = Path.cwd() / p
+            if not p.exists():
+                continue
+            ext = p.suffix.lower()
+            text = ""
+            if ext == ".pdf":
+                text = _extract_pdf_text(p)
+            elif ext == ".docx":
+                text = _extract_docx_text(p)
+            elif ext == ".pptx":
+                text = _extract_pptx_text(p)
+            # Fallback: try as utf-8 text
+            if not text and ext in {".txt", ".md", ".csv"}:
+                text = _safe_read_textfile(p)
+            if text:
+                out_chunks.append(f"--- Attachment: {p.name} ---\n{text}")
+        return "\n\n".join(out_chunks)
+
+    OUT_JSONL = Path("cache/useful_info_summaries.jsonl")
+
     def demo_f(idx, g):
         print(f"[{idx}] {g['subject']} (count: {g['count']})")
-        x = summarize_u_info(g['content'])
-        # Do your processing here. g['content'] is the whole thread block.
+        content = g['content']
+        attach_paths = _parse_attachment_paths(content)
+        if attach_paths:
+            attach_text = _extract_attachment_text(attach_paths)
+            if attach_text:
+                content = content + "\n\n[ATTACHMENT_TEXT]\n" + attach_text
+        x = summarize_u_info(content)
+
+        # Persist JSONL record (robust to non-JSON responses)
+        record = {
+            "index": idx,
+            "subject": g.get('subject'),
+            "count": g.get('count'),
+            "attachments": attach_paths,
+        }
+        try:
+            parsed = json.loads(x)
+            record["summary"] = parsed
+        except Exception:
+            record["summary_raw"] = x
+        with open(OUT_JSONL, "a", encoding="utf-8") as outf:
+            outf.write(json.dumps(record, ensure_ascii=False) + "\n")
     for_each_group(
         log_path="cache/email_usefulinfo_sorted.txt",
         f=demo_f,
         start=1,   # change to resume at Nth group
-        count=10
+        count=100
     )
 
 
 
@@ -493,5 +625,3 @@ if __name__ == "__main__":
 
     # Call the function in the options dict
     options[int(resp)][1]()
-
-
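A minimal usage sketch of the attachment helpers added in this commit, assuming they are available in the current scope; the thread block and file paths below are hypothetical, and any extractor whose optional library (PyPDF2, python-docx, python-pptx) is missing simply returns an empty string:

    # Hypothetical thread block; the "Attachments:" line is what _parse_attachment_paths looks for.
    block = (
        "### Project kickoff\n"
        "Attachments: attachments/agenda.pdf, attachments/minutes.docx, notes.txt\n"
    )
    paths = _parse_attachment_paths(block)    # deduplicated list of path strings
    text = _extract_attachment_text(paths)    # "" for missing or unreadable files
    if text:
        print(text[:500])                     # extractors already cap size via max_chars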
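The records that demo_f appends to cache/useful_info_summaries.jsonl are one JSON object per line, so they can be inspected later with a short reader; this is a sketch, not part of the commit:

    import json
    from pathlib import Path

    for line in Path("cache/useful_info_summaries.jsonl").read_text(encoding="utf-8").splitlines():
        rec = json.loads(line)
        # "summary" holds parsed JSON; otherwise "summary_raw" holds the raw model output.
        print(rec["index"], rec["subject"], "summary" in rec)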