commit 20366246db (parent 5ad02cf07e)

 gpt.py | 175
@@ -109,7 +109,7 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'):
     prefix_re = re.compile(r'^\s*(re|fw|fwd|aw|sv|vb|tr|wg)\s*:\s*', re.I)  # common locales too
     bracket_tag_re = re.compile(r'^\s*(\[[^\]]+\]\s*)+', re.I)
 
-    def normalize_subject(s: str) -> str:
+    def normalize_subject(s):
         if not s:
             return "(no subject)"
         s = s.strip()
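Note: the two compiled patterns kept unchanged in this hunk do the heavy lifting for subject normalization. A minimal standalone check of how they behave, using a made-up subject line; it exercises only the regexes, not the full normalize_subject body, most of which sits outside the hunk:

import re

prefix_re = re.compile(r'^\s*(re|fw|fwd|aw|sv|vb|tr|wg)\s*:\s*', re.I)
bracket_tag_re = re.compile(r'^\s*(\[[^\]]+\]\s*)+', re.I)

subject = "[EXT] Re: Fwd: Quarterly numbers"   # hypothetical example subject
subject = bracket_tag_re.sub('', subject)      # drop leading "[...]" tags
while prefix_re.match(subject):                # strip stacked reply/forward prefixes
    subject = prefix_re.sub('', subject, count=1)
print(subject)                                 # "Quarterly numbers"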
@@ -122,7 +122,7 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'):
         s = re.sub(r'\s+', ' ', s).strip()
         return s or "(no subject)"
 
-    def safe_name(s: str) -> str:
+    def safe_name(s):
         # reasonable Windows-safe folder name for a subject
         s = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', s)
         return s[:120]
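Note: safe_name changes only by losing its annotations; its whole body is visible above, so for reference a standalone copy with an invented subject string shows the character replacement and 120-character cap:

import re

def safe_name(s):
    # reasonable Windows-safe folder name for a subject
    s = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', s)
    return s[:120]

print(safe_name('Re: budget / Q3 "draft"?'))   # Re_ budget _ Q3 _draft__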
@@ -253,8 +253,6 @@ def fetch_useful_info(save_attachments=True, folder_name='useful info ref'):
 def process_useful_info():
     import re
     from pathlib import Path
-    from typing import Callable, List, Dict, Optional
-    from typing import Iterable
 
     # Optional import heavy libs only when needed
     try:
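Note: the typing imports are dropped because the annotations that used them are removed below. The "optional heavy libs" block that follows the try: is not shown in full here; the later guards (if not PyPDF2, if not docx, if not Presentation) suggest the usual pattern of binding each name to None when the import fails. A sketch of that assumed pattern, using the real package import names:

try:
    import PyPDF2
except Exception:
    PyPDF2 = None

try:
    import docx                      # python-docx
except Exception:
    docx = None

try:
    from pptx import Presentation    # python-pptx
except Exception:
    Presentation = None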
@@ -273,7 +271,7 @@ def process_useful_info():
     HEADER_RE = re.compile(r'\r?\n###\s(.*)\r?\n')  # your pattern, CRLF-safe
     COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)
 
-    def parse_groups(text: str) -> List[Dict]:
+    def parse_groups(text):
         """
         Return a list of groups found in the log file.
         Each group is a dict: {header, subject, count, content}
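Note: COUNT_RE expects group headers of the form "subject — N message(s)". A quick self-contained check with an invented header line:

import re

COUNT_RE = re.compile(r'^(?P<subject>.*?)\s+—\s+(?P<count>\d+)\s+message', re.I)

cm = COUNT_RE.search("Fiber outage follow-up — 4 messages")   # hypothetical header text
if cm:
    print(cm.group('subject'), int(cm.group('count')))        # Fiber outage follow-up 4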
@@ -291,7 +289,7 @@ def process_useful_info():
 
             # Try to extract subject and count if present
             subject = header
-            count: Optional[int] = None
+            count = None
             cm = COUNT_RE.search(header)
             if cm:
                 subject = cm.group('subject').strip()
@@ -308,12 +306,9 @@ def process_useful_info():
             })
         return groups
 
-    def for_each_group(
-        log_path: str = "cache/email_usefulinfo_sorted.txt",
-        f: Callable[[int, Dict], None] = lambda idx, g: None,
-        start: int = 1,
-        count: int = -1
-    ) -> None:
+    from usefulinfo_db import init_schema, insert_summary_and_events
+
+    def for_each_group(log_path="cache/email_usefulinfo_sorted.txt", f=lambda idx, g: None, start=1, count=-1):
         """
         Read the grouped log, split into groups, and call f(index, group) on each.
         start: 1-based index to begin processing (useful for resuming).
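Note: the compacted signature keeps the same defaults as the old annotated one. A usage sketch, assuming the grouped log already exists at the default path; the callback is illustrative, with the group dict shape taken from the parse_groups docstring:

def print_subjects(idx, g):
    # g is one parsed group dict: {header, subject, count, content}
    print(idx, g['subject'], g['count'])

# resume at group 10 and process at most 5 groups (count=-1 means no limit)
for_each_group(log_path="cache/email_usefulinfo_sorted.txt", f=print_subjects, start=10, count=5)

The hunk also adds the import of init_schema and insert_summary_and_events from usefulinfo_db; nothing in the hunks shown here calls those names, since the handlers added further down define their own _pg_* equivalents.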
@@ -335,8 +330,8 @@ def process_useful_info():
             if count != -1 and done >= count:
                 return
 
-    def _parse_attachment_paths(block: str) -> List[str]:
-        paths: List[str] = []
+    def _parse_attachment_paths(block):
+        paths = []
         for line in block.splitlines():
             if line.startswith("Attachments:"):
                 # After the colon, comma-separated file paths
@@ -353,13 +348,13 @@ def process_useful_info():
                     uniq.append(p)
         return uniq
 
-    def _safe_read_textfile(p: Path, max_chars: int = 8000) -> str:
+    def _safe_read_textfile(p, max_chars=8000):
         try:
             return p.read_text(encoding="utf-8", errors="replace")[:max_chars]
         except Exception:
             return ""
 
-    def _extract_pdf_text(p: Path, max_pages: int = 10, max_chars: int = 12000) -> str:
+    def _extract_pdf_text(p, max_pages=10, max_chars=12000):
         if not PyPDF2:
             return ""
         text = []
@@ -376,7 +371,7 @@ def process_useful_info():
             return ""
         return "\n".join(text)[:max_chars]
 
-    def _extract_docx_text(p: Path, max_chars: int = 12000) -> str:
+    def _extract_docx_text(p, max_chars=12000):
         if not docx:
             return ""
         try:
@@ -386,12 +381,12 @@ def process_useful_info():
         except Exception:
             return ""
 
-    def _extract_pptx_text(p: Path, max_chars: int = 12000) -> str:
+    def _extract_pptx_text(p, max_chars=12000):
         if not Presentation:
             return ""
         try:
             pres = Presentation(str(p))
-            chunks: List[str] = []
+            chunks = []
             for slide in pres.slides:
                 for shape in slide.shapes:
                     try:
@@ -406,8 +401,8 @@ def process_useful_info():
         except Exception:
             return ""
 
-    def _extract_attachment_text(paths: Iterable[str]) -> str:
-        out_chunks: List[str] = []
+    def _extract_attachment_text(paths):
+        out_chunks = []
         for raw in paths:
             p = Path(raw)
             # Ensure relative paths still resolve from repo root
@@ -432,6 +427,134 @@ def process_useful_info():
 
     OUT_JSONL = Path("cache/useful_info_summaries.jsonl")
 
+    # --- PostgreSQL schema + insert helpers (via localcache2.db) ---
+    def _pg_init_schema():
+        try:
+            from localcache2 import db as _db
+            CON, CURSOR = _db()
+            try:
+                CURSOR.execute(
+                    """
+                    CREATE TABLE IF NOT EXISTS useful_info_summary (
+                        id BIGSERIAL PRIMARY KEY,
+                        summary_hash CHAR(64) UNIQUE,
+                        grp_index INTEGER,
+                        subject TEXT,
+                        thread_count INTEGER,
+                        source TEXT,
+                        date_label TEXT,
+                        tags_json JSONB,
+                        short_text TEXT,
+                        summary_text TEXT,
+                        attachments_json JSONB,
+                        raw_json TEXT,
+                        created_at TIMESTAMPTZ DEFAULT now()
+                    );
+                    """
+                )
+                CURSOR.execute(
+                    """
+                    CREATE TABLE IF NOT EXISTS useful_info_event (
+                        id BIGSERIAL PRIMARY KEY,
+                        summary_id BIGINT NOT NULL REFERENCES useful_info_summary(id) ON DELETE CASCADE,
+                        dt TEXT,
+                        length TEXT,
+                        title TEXT,
+                        description TEXT,
+                        created_at TIMESTAMPTZ DEFAULT now()
+                    );
+                    """
+                )
+                CON.commit()
+            finally:
+                CURSOR.close()
+                CON.close()
+        except Exception as e:
+            print("[warn] could not init PostgreSQL schema:", e)
+
+    def _sha256(s):
+        import hashlib
+        return hashlib.sha256(s.encode('utf-8', 'ignore')).hexdigest()
+
+    def _pg_insert_summary_and_events(idx, subject, count, attachments, parsed, raw):
+        try:
+            from localcache2 import db as _db
+            import json as _json
+            CON, CURSOR = _db()
+            try:
+                source = None
+                date_label = None
+                tags = None
+                short_text = ''
+                summary_text = ''
+                events = []
+                if parsed:
+                    source = parsed.get('source')
+                    date_label = parsed.get('date')
+                    tags = parsed.get('tags')
+                    short_text = parsed.get('short') or ''
+                    summary_text = parsed.get('summary') or ''
+                    events = parsed.get('events') or []
+
+                s_hash = _sha256((subject or '') + "\n" + short_text + "\n" + summary_text)
+                CURSOR.execute(
+                    """
+                    INSERT INTO useful_info_summary
+                        (summary_hash, grp_index, subject, thread_count, source, date_label,
+                         tags_json, short_text, summary_text, attachments_json, raw_json)
+                    VALUES
+                        (%s, %s, %s, %s, %s, %s,
+                         CAST(%s AS JSONB), %s, %s, CAST(%s AS JSONB), %s)
+                    ON CONFLICT (summary_hash)
+                    DO UPDATE SET grp_index = EXCLUDED.grp_index
+                    RETURNING id
+                    """,
+                    (
+                        s_hash,
+                        idx,
+                        subject,
+                        count,
+                        source,
+                        date_label,
+                        _json.dumps(tags) if tags is not None else None,
+                        short_text,
+                        summary_text,
+                        _json.dumps(attachments) if attachments else None,
+                        raw,
+                    ),
+                )
+                row = CURSOR.fetchone()
+                summary_id = row[0] if row else None
+
+                if summary_id and isinstance(events, list):
+                    for e in events:
+                        try:
+                            CURSOR.execute(
+                                """
+                                INSERT INTO useful_info_event
+                                    (summary_id, dt, length, title, description)
+                                VALUES (%s, %s, %s, %s, %s)
+                                """,
+                                (
+                                    summary_id,
+                                    (e or {}).get('dt'),
+                                    (e or {}).get('length'),
+                                    (e or {}).get('title'),
+                                    (e or {}).get('description'),
+                                ),
+                            )
+                        except Exception:
+                            pass
+                CON.commit()
+            finally:
+                CURSOR.close()
+                CON.close()
+        except Exception as e:
+            print("[warn] PostgreSQL insert failed:", e)
+
+    # Ensure DB schema exists
+    _pg_init_schema()
+
     def demo_f(idx, g):
         print(f"[{idx}] {g['subject']} (count: {g['count']})")
         content = g['content']
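Note: the new helpers assume localcache2.db() hands back an open (connection, cursor) pair whose cursor takes %s placeholders and JSONB casts, i.e. a psycopg2-style driver; the module itself is not part of this diff. A hypothetical minimal sketch of that contract, with invented connection settings:

# localcache2.py -- hypothetical sketch; the real module is not shown in this commit.
import psycopg2

def db():
    # Return (connection, cursor); callers commit and close both, as the new
    # _pg_init_schema / _pg_insert_summary_and_events helpers do.
    con = psycopg2.connect(
        host="localhost", dbname="localcache", user="cache", password="cache"  # invented settings
    )
    return con, con.cursor()

Deduplication rides on summary_hash: _sha256 over subject, short_text, and summary_text, combined with ON CONFLICT (summary_hash) DO UPDATE ... RETURNING id, so re-running the processor updates grp_index on the existing row rather than inserting a duplicate, and useful_info_event rows cascade-delete with their parent summary.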
@@ -457,6 +580,16 @@ def process_useful_info():
         with open(OUT_JSONL, "a", encoding="utf-8") as outf:
             outf.write(json.dumps(record, ensure_ascii=False) + "\n")
 
+        # Also persist to PostgreSQL using localcache2
+        if 'summary' in record:
+            _pg_insert_summary_and_events(
+                idx, record.get('subject'), record.get('count'), attach_paths, record['summary'], None
+            )
+        else:
+            _pg_insert_summary_and_events(
+                idx, record.get('subject'), record.get('count'), attach_paths, None, record.get('summary_raw')
+            )
+
     for_each_group(
         log_path="cache/email_usefulinfo_sorted.txt",
         f=demo_f,
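Note: records still land in cache/useful_info_summaries.jsonl as before; PostgreSQL is a second sink, not a replacement. A small sketch for reading the JSONL back, assuming the processor has already written records; the field names (subject, count, summary, summary_raw) are the keys used above:

import json
from pathlib import Path

# Read back the JSONL written by demo_f; tolerate records where the model
# output could not be parsed and only 'summary_raw' is present.
for line in Path("cache/useful_info_summaries.jsonl").read_text(encoding="utf-8").splitlines():
    record = json.loads(line)
    label = "parsed" if record.get("summary") else "raw only"
    print(f"[{label}] {record.get('subject')} ({record.get('count')} messages)")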