# notesapp/app.py
#
# Flask knowledge-base app: notes are stored as Markdown files with YAML
# front matter under KB_DIR, and JSON endpoints list, filter, save, and
# segment-import them.
import os, re, uuid, yaml, glob, docx, PyPDF2, json
from flask import Flask, request, jsonify
INDEX_FILE = "kb_index.json"
WORD_THRESHOLD = 500 # Only generate summary/tags if text > 500 words
KB_DIR = "kb" # Directory where your Markdown notes are stored
UPLOAD_DIR = 'uploads'
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(KB_DIR, exist_ok=True)
# ------------------------------
# LLM API Stub for Summarization & Tagging
# ------------------------------
def llm_summarize_and_tag(text):
"""
Stub for calling an LLM API.
Replace with your API call.
Returns a tuple: (summary, [tags])
"""
summary = f"Auto summary: {text[:100]}..." # Simple stub summary
tags = ["auto", "generated"]
return summary, tags
# ------------------------------
# File Extraction Helpers
# ------------------------------
def extract_text_from_pdf(file_path):
text = ""
try:
with open(file_path, "rb") as f:
reader = PyPDF2.PdfReader(f)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
except Exception as e:
print(f"Error reading PDF {file_path}: {e}")
return text
def extract_text_from_docx(file_path):
text = ""
try:
doc = docx.Document(file_path)
for para in doc.paragraphs:
text += para.text + "\n"
except Exception as e:
print(f"Error reading DOCX {file_path}: {e}")
return text
# ------------------------------
# Segmentation Helper
# ------------------------------
def segment_text(text):
"""
Splits text into segments based on double newlines.
Returns a list of non-empty segments.
"""
segments = [seg.strip() for seg in re.split(r'\n\s*\n', text) if seg.strip()]
return segments
def load_note_from_file(file_path):
"""Load a note from a Markdown file by parsing its YAML front matter."""
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
metadata = {}
body = content
if content.startswith("---"):
# Look for YAML front matter between the first two '---' lines
match = re.search(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
if match:
frontmatter_str = match.group(1)
try:
metadata = yaml.safe_load(frontmatter_str) or {}
except Exception as e:
print(f"Error parsing YAML in {file_path}: {e}")
body = content[match.end():].strip()
metadata["body"] = body
metadata["file_path"] = file_path # Used for updating an existing note
return metadata
def get_all_notes():
"""Return a list of all notes in the KB."""
notes = []
for file_path in glob.glob(os.path.join(KB_DIR, "*.md")):
note = load_note_from_file(file_path)
notes.append(note)
return notes
def get_notes_with_tag(tag):
"""Return only the notes that contain a specific tag."""
notes = get_all_notes()
filtered = [note for note in notes if "tags" in note and tag in note["tags"]]
return filtered
def get_related_tags(tag):
"""
For a given tag, return a dict of other tags that appear in the same notes
along with their frequency.
"""
notes = get_notes_with_tag(tag)
related = {}
for note in notes:
for t in note.get("tags", []):
if t != tag:
related[t] = related.get(t, 0) + 1
return related
app = Flask(__name__)
# ------------------------------
# Flask Endpoints
# ------------------------------
@app.route("/api/notes", methods=["GET"])
def api_notes():
"""
If a query parameter 'tag' is provided, return only notes with that tag.
Otherwise, return all notes.
"""
tag = request.args.get("tag")
if tag:
notes = get_notes_with_tag(tag)
else:
notes = get_all_notes()
# Remove file_path from the returned data
for note in notes:
note.pop("file_path", None)
return jsonify(notes)
@app.route("/api/related_tags", methods=["GET"])
def api_related_tags():
"""Return tags that appear with a given tag (with frequency counts)."""
tag = request.args.get("tag")
if not tag:
return jsonify({"error": "tag parameter is required"}), 400
related = get_related_tags(tag)
return jsonify(related)
@app.route("/api/note/<note_id>", methods=["GET"])
def get_note(note_id):
"""Return a single note by its ID."""
notes = get_all_notes()
for note in notes:
if note.get("id") == note_id:
note_copy = note.copy()
note_copy.pop("file_path", None)
return jsonify(note_copy)
return jsonify({"error": "Note not found"}), 404
# ------------------------------
# SAVE or update a note
# ------------------------------
@app.route("/api/note", methods=["POST"])
def save_note():
"""
Save or update a note. The expected JSON payload should contain:
- id (optional; if provided, the note is updated)
- tags (a commaseparated string)
- summary (optional)
- body (the note content)
The note is stored as a Markdown file with YAML front matter.
"""
data = request.get_json()
if not data:
return jsonify({"error": "No JSON provided"}), 400
note_id = data.get("id")
if note_id:
# Update existing note: find the file by matching note_id
notes = get_all_notes()
file_path = None
for note in notes:
if note.get("id") == note_id:
file_path = note.get("file_path")
break
if not file_path:
return jsonify({"error": "Note not found"}), 404
else:
# Create a new note
note_id = str(uuid.uuid4())
file_path = os.path.join(KB_DIR, f"{note_id}.md")
# Process tags (assumes a commaseparated string)
tags = [tag.strip() for tag in data.get("tags", "").split(",") if tag.strip()]
metadata = {
"id": note_id,
"tags": tags,
"summary": data.get("summary", "")
}
frontmatter = "---\n" + yaml.dump(metadata, default_flow_style=False) + "---\n\n"
content = frontmatter + data.get("body", "")
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)
return jsonify({"status": "success", "id": note_id})
@app.route('/confirm', methods=['POST'])
def confirm():
"""
Receives a JSON payload with a file_id and reviewed segments.
For each segment longer than 500 words, automatically generate a summary and tags.
Each segment is then saved as a Markdown note with YAML front matter.
"""
data = request.get_json()
if not data:
return jsonify({"error": "No JSON provided"}), 400
file_id = data.get("file_id")
segments = data.get("segments") # List of segments with potential manual edits
if not file_id or segments is None:
return jsonify({"error": "file_id and segments are required"}), 400
note_ids = []
for segment in segments:
seg_text = segment.get("text", "")
metadata = {}
# If segment is long, generate summary and tags via LLM API
if len(seg_text.split()) > 500:
summary, auto_tags = llm_summarize_and_tag(seg_text)
metadata["summary"] = summary
metadata["tags"] = auto_tags
# Generate a unique note ID and add source reference
note_id = str(uuid.uuid4())
metadata["id"] = note_id
metadata["source_file"] = file_id
# Build Markdown content with YAML front matter
md_content = "---\n"
md_content += yaml.dump(metadata, default_flow_style=False)
md_content += "---\n\n"
md_content += seg_text
# Save the note
md_filename = os.path.join(KB_DIR, f"{note_id}.md")
with open(md_filename, "w", encoding="utf-8") as f:
f.write(md_content)
note_ids.append(note_id)
return jsonify({"status": "success", "note_ids": note_ids})
if __name__ == "__main__":
app.run(debug=True)