264 lines
7.9 KiB
Python
264 lines
7.9 KiB
Python
import os, re, uuid, yaml, glob, docx, PyPDF2, json
|
||
from flask import Flask, request, jsonify
|
||
|
||
|
||
# Module-level configuration: file names, thresholds and storage locations.
INDEX_FILE = "kb_index.json"
WORD_THRESHOLD = 500  # Only generate summary/tags if text > 500 words
KB_DIR = "kb"  # Directory where your Markdown notes are stored
UPLOAD_DIR = 'uploads'

# Make sure both storage directories exist as soon as the module is loaded;
# exist_ok=True makes this a no-op when they are already present.
for _storage_dir in (UPLOAD_DIR, KB_DIR):
    os.makedirs(_storage_dir, exist_ok=True)
|
||
|
||
|
||
# ------------------------------
|
||
# LLM API Stub for Summarization & Tagging
|
||
# ------------------------------
|
||
def llm_summarize_and_tag(text):
    """Return a placeholder summary and tag list for *text*.

    This is a stub standing in for a real LLM API call; it performs no
    network request. The summary is the first 100 characters of *text*
    wrapped in a fixed prefix and a trailing ellipsis, and the tags are a
    constant list.

    Returns:
        A ``(summary, tags)`` tuple where ``tags`` is a list of strings.
    """
    preview = text[:100]
    return f"Auto summary: {preview}...", ["auto", "generated"]
|
||
|
||
# ------------------------------
|
||
# File Extraction Helpers
|
||
# ------------------------------
|
||
def extract_text_from_pdf(file_path):
    """Extract the text of every page of the PDF at *file_path*.

    Pages for which the reader yields no text are skipped; every other
    page contributes its text followed by a newline. Extraction is
    best-effort: any error is printed and whatever text was collected
    before the failure is still returned.

    Returns:
        The concatenated page text, or an empty string if nothing could
        be read.
    """
    chunks = []
    try:
        with open(file_path, "rb") as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                extracted = page.extract_text()
                if extracted:
                    chunks.append(extracted + "\n")
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
    return "".join(chunks)
|
||
|
||
def extract_text_from_docx(file_path):
    """Extract the paragraph text of the DOCX document at *file_path*.

    Each paragraph contributes its text followed by a newline. Extraction
    is best-effort: any error is printed and whatever text was collected
    before the failure is still returned.

    Returns:
        The concatenated paragraph text, or an empty string if nothing
        could be read.
    """
    lines = []
    try:
        document = docx.Document(file_path)
        for paragraph in document.paragraphs:
            lines.append(paragraph.text + "\n")
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
    return "".join(lines)
|
||
|
||
# ------------------------------
|
||
# Segmentation Helper
|
||
# ------------------------------
|
||
def segment_text(text):
    """Split *text* into segments separated by blank lines.

    A separator is a newline, optional whitespace, and another newline.
    Each segment is stripped of surrounding whitespace, and segments that
    are empty after stripping are dropped.

    Returns:
        A list of non-empty, stripped segments in their original order.
    """
    pieces = []
    for chunk in re.split(r'\n\s*\n', text):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces
|
||
|
||
|
||
|
||
def load_note_from_file(file_path):
    """Load a note from a Markdown file by parsing its YAML front matter.

    The file is read as UTF-8. If it starts with a ``---`` delimited block,
    that block is parsed as YAML and used as the note's metadata, and the
    remaining text (stripped) becomes the body. Otherwise the whole file
    content is the body and the metadata is empty.

    Args:
        file_path: Path of the Markdown file to read.

    Returns:
        A dict holding the front-matter fields plus two keys set by this
        function: ``"body"`` (the note text) and ``"file_path"`` (the path
        that was read). These two override same-named front-matter fields.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    metadata = {}
    body = content
    if content.startswith("---"):
        # Front matter sits between the first two '---' lines. The closing
        # delimiter may also be the very last line of the file, so accept
        # end-of-text in place of the trailing newline.
        match = re.search(r"^---\s*\n(.*?)\n---\s*(?:\n|\Z)", content, re.DOTALL)
        if match:
            try:
                metadata = yaml.safe_load(match.group(1)) or {}
            except yaml.YAMLError as e:
                # Best effort: a broken header must not hide the note body.
                print(f"Error parsing YAML in {file_path}: {e}")
            if not isinstance(metadata, dict):
                # Valid YAML that is a bare scalar or a list cannot carry
                # named fields; treat it as "no metadata" instead of
                # failing on the key assignments below.
                metadata = {}
            body = content[match.end():].strip()

    metadata["body"] = body
    metadata["file_path"] = file_path  # Used for updating an existing note
    return metadata
|
||
|
||
def get_all_notes():
    """Load every ``*.md`` file directly inside ``KB_DIR`` as a note.

    Returns:
        A list with one ``load_note_from_file`` result per Markdown file,
        in the order ``glob`` yields the paths.
    """
    markdown_pattern = os.path.join(KB_DIR, "*.md")
    return [load_note_from_file(path) for path in glob.glob(markdown_pattern)]
|
||
|
||
def get_notes_with_tag(tag):
    """Return only the notes that contain a specific tag.

    Front matter may be hand-edited, so the ``tags`` field is not assumed
    to be a list: a missing or empty field means "no tags", and a plain
    string is read as a comma-separated tag list so it is matched by whole
    tag rather than by substring. Any other non-collection value is
    treated as "no tags".

    Args:
        tag: The tag to look for.

    Returns:
        A list of note dicts, as produced by ``get_all_notes``, whose tags
        include *tag*.
    """
    matching = []
    for note in get_all_notes():
        tags = note.get("tags")
        if isinstance(tags, str):
            # ``tags: python, flask`` written as a scalar instead of a list.
            tags = [part.strip() for part in tags.split(",") if part.strip()]
        elif not isinstance(tags, (list, tuple, set, dict)):
            # None (``tags:`` with no value), numbers, etc. would make the
            # membership test raise; such a note simply has no tags.
            continue
        if tag in tags:
            matching.append(note)
    return matching
|
||
|
||
def get_related_tags(tag):
    """Count the other tags that share a note with *tag*.

    Every note returned by ``get_notes_with_tag(tag)`` is inspected and each
    of its tags other than *tag* itself is tallied once per occurrence.

    A ``tags`` field that is a plain string is read as a comma-separated
    list (instead of being iterated character by character), and a missing,
    empty or non-collection field contributes nothing.

    Args:
        tag: The tag whose co-occurring tags are wanted.

    Returns:
        A dict mapping each co-occurring tag to its occurrence count; empty
        when no note carries *tag*.
    """
    related = {}
    for note in get_notes_with_tag(tag):
        tags = note.get("tags") or []
        if isinstance(tags, str):
            tags = [part.strip() for part in tags.split(",") if part.strip()]
        elif not isinstance(tags, (list, tuple, set, dict)):
            # Not something that holds tags (e.g. a number); skip the note.
            continue
        for other in tags:
            if other != tag:
                related[other] = related.get(other, 0) + 1
    return related
|
||
|
||
# Flask application object; the endpoint functions in this file register
# themselves on it through ``@app.route``.
app = Flask(__name__)
|
||
|
||
|
||
|
||
|
||
# ------------------------------
|
||
# Flask Endpoints
|
||
# ------------------------------
|
||
|
||
|
||
|
||
@app.route("/api/notes", methods=["GET"])
def api_notes():
    """List notes as JSON, optionally filtered by tag.

    When the ``tag`` query parameter is present and non-empty, only notes
    carrying that tag are returned; otherwise every note is returned. The
    internal ``file_path`` field is left out of each note in the response.
    """
    wanted_tag = request.args.get("tag")
    selected = get_notes_with_tag(wanted_tag) if wanted_tag else get_all_notes()
    # file_path is only needed server-side, so it is not exposed to clients.
    public_notes = [
        {key: value for key, value in entry.items() if key != "file_path"}
        for entry in selected
    ]
    return jsonify(public_notes)
|
||
|
||
@app.route("/api/related_tags", methods=["GET"])
def api_related_tags():
    """Return, as JSON, the tags that co-occur with the ``tag`` query parameter.

    The response maps each co-occurring tag to its frequency count. A
    missing or empty ``tag`` parameter yields a 400 response with an
    ``error`` message.
    """
    tag = request.args.get("tag")
    if tag:
        return jsonify(get_related_tags(tag))
    return jsonify({"error": "tag parameter is required"}), 400
|
||
|
||
@app.route("/api/note/<note_id>", methods=["GET"])
def get_note(note_id):
    """Return the note whose ``id`` equals *note_id* as JSON.

    The first matching note is returned without its internal ``file_path``
    field. If no note matches, a 404 response with an ``error`` message is
    returned.
    """
    found = next(
        (entry for entry in get_all_notes() if entry.get("id") == note_id),
        None,
    )
    if found is None:
        return jsonify({"error": "Note not found"}), 404
    public_note = {key: value for key, value in found.items() if key != "file_path"}
    return jsonify(public_note)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# ------------------------------
|
||
# SAVE or update a note
|
||
# ------------------------------
|
||
|
||
@app.route("/api/note", methods=["POST"])
def save_note():
    """Save or update a note.

    The expected JSON payload is an object containing:
      - id (optional; if provided, the existing note with that id is updated)
      - tags (a comma-separated string, or a list of tags)
      - summary (optional)
      - body (the note content)

    The note is stored as a Markdown file with YAML front matter. On update,
    front-matter fields this endpoint does not manage (for example a
    ``source_file`` reference) are carried over instead of being dropped.

    Responses:
      - 200 ``{"status": "success", "id": ...}`` on success
      - 400 when no JSON object is supplied or ``tags`` has an unusable type
      - 404 when ``id`` is given but no such note exists
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No JSON provided"}), 400
    if not isinstance(data, dict):
        # A JSON array or scalar has no fields to read.
        return jsonify({"error": "JSON payload must be an object"}), 400

    # Accept the documented comma-separated string as well as a JSON array;
    # a missing or null value means "no tags".
    raw_tags = data.get("tags") or ""
    if isinstance(raw_tags, str):
        raw_tags = raw_tags.split(",")
    elif not isinstance(raw_tags, (list, tuple)):
        return jsonify(
            {"error": "tags must be a comma-separated string or a list"}
        ), 400
    tags = [str(tag).strip() for tag in raw_tags if str(tag).strip()]

    note_id = data.get("id")
    preserved = {}
    if note_id:
        # Update existing note: find the file by matching note_id
        existing = next(
            (note for note in get_all_notes() if note.get("id") == note_id),
            None,
        )
        if existing is None:
            return jsonify({"error": "Note not found"}), 404
        file_path = existing["file_path"]
        # Keep every stored field except the two synthetic keys that the
        # loader adds, so rewriting the file does not lose metadata.
        preserved = {
            key: value
            for key, value in existing.items()
            if key not in ("body", "file_path")
        }
    else:
        # Create a new note
        note_id = str(uuid.uuid4())
        file_path = os.path.join(KB_DIR, f"{note_id}.md")

    metadata = {
        **preserved,
        "id": note_id,
        "tags": tags,
        "summary": data.get("summary", ""),
    }
    frontmatter = "---\n" + yaml.dump(metadata, default_flow_style=False) + "---\n\n"
    # A null body is stored as an empty note rather than failing on the
    # string concatenation.
    content = frontmatter + (data.get("body") or "")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

    return jsonify({"status": "success", "id": note_id})
|
||
|
||
|
||
|
||
@app.route('/confirm', methods=['POST'])
def confirm():
    """Store reviewed segments of an uploaded file as individual notes.

    Receives a JSON object with ``file_id`` and ``segments``. ``segments``
    is a list whose items are either objects with a ``text`` field or plain
    strings. For each segment longer than ``WORD_THRESHOLD`` words, a
    summary and tags are generated automatically. Each segment is then
    saved as a Markdown note with YAML front matter that records a fresh
    note id and the originating ``file_id``.

    The whole payload is validated before anything is written, so a
    malformed segment does not leave a partial import on disk.

    Responses:
      - 200 ``{"status": "success", "note_ids": [...]}`` on success
      - 400 when the payload is missing, incomplete or malformed
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No JSON provided"}), 400
    if not isinstance(data, dict):
        return jsonify({"error": "JSON payload must be an object"}), 400

    file_id = data.get("file_id")
    segments = data.get("segments")  # List of segments with potential manual edits
    if not file_id or segments is None:
        return jsonify({"error": "file_id and segments are required"}), 400
    if not isinstance(segments, list):
        return jsonify({"error": "segments must be a list"}), 400

    # First pass: pull the text out of every segment and reject bad input.
    segment_texts = []
    for segment in segments:
        if isinstance(segment, dict):
            seg_text = segment.get("text") or ""
        else:
            seg_text = segment
        if not isinstance(seg_text, str):
            return jsonify(
                {"error": "each segment must be a string or an object with a text string"}
            ), 400
        segment_texts.append(seg_text)

    # Second pass: write one note per segment.
    note_ids = []
    for seg_text in segment_texts:
        metadata = {}
        # If segment is long, generate summary and tags via LLM API
        if len(seg_text.split()) > WORD_THRESHOLD:
            summary, auto_tags = llm_summarize_and_tag(seg_text)
            metadata["summary"] = summary
            metadata["tags"] = auto_tags

        # Generate a unique note ID and add source reference
        note_id = str(uuid.uuid4())
        metadata["id"] = note_id
        metadata["source_file"] = file_id

        # Build Markdown content with YAML front matter
        md_content = (
            "---\n"
            + yaml.dump(metadata, default_flow_style=False)
            + "---\n\n"
            + seg_text
        )

        # Save the note
        md_filename = os.path.join(KB_DIR, f"{note_id}.md")
        with open(md_filename, "w", encoding="utf-8") as f:
            f.write(md_content)
        note_ids.append(note_id)

    return jsonify({"status": "success", "note_ids": note_ids})
|
||
|
||
|
||
|
||
if __name__ == "__main__":
    # Executed directly (not imported): start Flask's built-in server with
    # debug mode switched on.
    app.run(debug=True)