# notesapp/app.py

import os, re, uuid, yaml, glob, docx, PyPDF2, json
from flask import Flask, request, jsonify


# File name for a knowledge-base index; nothing in the visible code reads or writes it.
INDEX_FILE = "kb_index.json"
WORD_THRESHOLD = 500  # Only generate summary/tags if text > 500 words

KB_DIR = "kb"  # Directory where your Markdown notes are stored
# Directory name for uploads; nothing in the visible code reads or writes it.
UPLOAD_DIR = 'uploads'
# Make sure both directories exist at import time; exist_ok=True turns this
# into a no-op when they are already present.
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(KB_DIR, exist_ok=True)


# ------------------------------
# LLM API Stub for Summarization & Tagging
# ------------------------------
def llm_summarize_and_tag(text):
    """
    Placeholder for a real LLM summarization/tagging call.

    Fakes a summary from the first 100 characters of ``text`` and pairs it
    with a fixed tag list, so callers can be exercised without an API.
    Swap the body for an actual API request when one is available.

    Returns a ``(summary, tags)`` tuple where ``tags`` is a list of strings.
    """
    preview = text[:100]  # the stub only ever looks at the leading slice
    fake_summary = f"Auto summary: {preview}..."
    fake_tags = ["auto", "generated"]
    return fake_summary, fake_tags

# ------------------------------
# File Extraction Helpers
# ------------------------------
def extract_text_from_pdf(file_path):
    """
    Extract the text of every page of the PDF at ``file_path``.

    Pages that yield no text are skipped; each remaining page's text is
    followed by a newline.  Any error is printed, and whatever was collected
    before the failure (possibly an empty string) is returned.
    """
    pieces = []
    try:
        with open(file_path, "rb") as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                extracted = page.extract_text()
                if not extracted:
                    continue
                pieces.append(extracted + "\n")
    except Exception as err:
        print(f"Error reading PDF {file_path}: {err}")
    # Joined outside the try so pages gathered before an error still count.
    return "".join(pieces)

def extract_text_from_docx(file_path):
    """
    Extract the text of every paragraph of the DOCX at ``file_path``.

    Each paragraph's text is followed by a newline.  Any error is printed,
    and whatever was collected before the failure (possibly an empty string)
    is returned.
    """
    pieces = []
    try:
        document = docx.Document(file_path)
        for paragraph in document.paragraphs:
            pieces.append(paragraph.text + "\n")
    except Exception as err:
        print(f"Error reading DOCX {file_path}: {err}")
    # Joined outside the try so paragraphs gathered before an error still count.
    return "".join(pieces)

# ------------------------------
# Segmentation Helper
# ------------------------------
def segment_text(text):
    """
    Break ``text`` into segments at blank lines.

    A boundary is a newline, optional whitespace, then another newline.
    Every piece is stripped of surrounding whitespace and pieces that end up
    empty are dropped.  Returns the remaining pieces as a list, in order.
    """
    pieces = []
    for chunk in re.split(r'\n\s*\n', text):
        cleaned = chunk.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces


def load_note_from_file(file_path):
    """
    Load a note from a Markdown file by parsing its YAML front matter.

    The front matter is the block between the leading ``---`` line and the
    next ``---`` line.  Its keys become keys of the returned dict, which
    always also carries:

      - ``body``: the text after the front matter (stripped), or the whole
        file content when no front matter is found;
      - ``file_path``: the path the note was read from (used for updating
        an existing note).

    Front matter that fails to parse, or that parses to something other than
    a mapping (e.g. a bare string or a list), is reported and ignored, so a
    single malformed file cannot break every caller that lists notes.

    Raises whatever ``open``/``read`` raise when the file cannot be read.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    metadata = {}
    body = content
    if content.startswith("---"):
        # Look for YAML front matter between the first two '---' lines
        match = re.search(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
        if match:
            parsed = None
            try:
                parsed = yaml.safe_load(match.group(1))
            except Exception as e:
                # Deliberately broad: besides yaml.YAMLError, safe_load can
                # raise e.g. ValueError while constructing an invalid date.
                print(f"Error parsing YAML in {file_path}: {e}")
            if isinstance(parsed, dict):
                metadata = parsed
            elif parsed:
                # A scalar or list cannot take the "body"/"file_path" keys.
                print(f"Ignoring non-mapping front matter in {file_path}")
            body = content[match.end():].strip()
    metadata["body"] = body
    metadata["file_path"] = file_path  # Used for updating an existing note
    return metadata

def get_all_notes():
    """
    Load every ``*.md`` file directly inside ``KB_DIR``.

    Returns a list of note dicts as produced by ``load_note_from_file``, in
    the order ``glob`` yields the paths.
    """
    pattern = os.path.join(KB_DIR, "*.md")
    return [load_note_from_file(md_path) for md_path in glob.glob(pattern)]

def get_notes_with_tag(tag):
    """
    Return only the notes that carry ``tag`` as an exact tag.

    A note's ``tags`` front-matter value is interpreted as follows:

      - a list/tuple/set: ``tag`` must be one of its entries;
      - a single string (``tags: foo``): treated as the one tag ``foo``,
        so it matches only on equality rather than on substring;
      - missing, null or any other type: the note has no tags.

    The returned note dicts are not modified.
    """
    matching = []
    for note in get_all_notes():
        tags = note.get("tags")
        if isinstance(tags, str):
            # `tag in "python"` would be a substring test ("py" matches).
            tags = [tags]
        elif not isinstance(tags, (list, tuple, set, frozenset)):
            # Missing key, `tags:` with no value (None), numbers, ...
            continue
        if tag in tags:
            matching.append(note)
    return matching

def get_related_tags(tag):
    """
    For a given tag, return a dict of other tags that appear in the same notes
    along with their frequency.

    Only notes returned by ``get_notes_with_tag(tag)`` are considered.  A
    note whose ``tags`` value is not a list/tuple/set (for instance a single
    scalar string) contributes nothing: iterating a string would count its
    individual characters as tags.

    Returns ``{other_tag: number_of_notes_it_co_occurs_in_occurrences}``,
    where each occurrence of ``other_tag`` in a note's tag collection adds
    one to its count; the result is empty when nothing co-occurs.
    """
    related = {}
    for note in get_notes_with_tag(tag):
        tags = note.get("tags")
        if not isinstance(tags, (list, tuple, set, frozenset)):
            continue
        for other in tags:
            if other != tag:
                related[other] = related.get(other, 0) + 1
    return related

# Flask application instance; the @app.route decorators below register their
# handlers on it.
app = Flask(__name__)


# ------------------------------
# Flask Endpoints
# ------------------------------


@app.route("/api/notes", methods=["GET"])
def api_notes():
    """
    List notes as JSON.

    With a non-empty ``tag`` query parameter only the notes carrying that tag
    are returned; otherwise every note is.  The internal ``file_path`` key is
    stripped from each note before serialization.
    """
    wanted_tag = request.args.get("tag")
    selected = get_notes_with_tag(wanted_tag) if wanted_tag else get_all_notes()
    # file_path is server-side bookkeeping and is kept out of the response
    for entry in selected:
        entry.pop("file_path", None)
    return jsonify(selected)

@app.route("/api/related_tags", methods=["GET"])
def api_related_tags():
    """
    Return, as JSON, the tags that co-occur with the ``tag`` query parameter
    together with their frequency counts.

    Responds with HTTP 400 and an error object when ``tag`` is missing or
    empty.
    """
    tag = request.args.get("tag")
    if tag:
        return jsonify(get_related_tags(tag))
    return jsonify({"error": "tag parameter is required"}), 400

@app.route("/api/note/<note_id>", methods=["GET"])
def get_note(note_id):
    """
    Return the first note whose ``id`` equals ``note_id`` as JSON, without
    its internal ``file_path`` key.

    Responds with HTTP 404 and an error object when no note matches.
    """
    found = next(
        (candidate for candidate in get_all_notes() if candidate.get("id") == note_id),
        None,
    )
    if found is None:
        return jsonify({"error": "Note not found"}), 404
    # Build a filtered copy rather than mutating the loaded note.
    public_view = {key: value for key, value in found.items() if key != "file_path"}
    return jsonify(public_view)


# ------------------------------
# Save or update a note
# ------------------------------

@app.route("/api/note", methods=["POST"])
def save_note():
    """
    Save or update a note. The expected JSON payload should contain:
      - id (optional; if provided, the note is updated)
      - tags (a comma-separated string, or a list of tags)
      - summary (optional)
      - body (the note content)
    The note is stored as a Markdown file with YAML front matter.

    On update, front-matter keys this endpoint does not manage (for example
    the ``source_file`` written by ``/confirm``) are carried over instead of
    being dropped.  ``tags``, ``summary`` and the body are always replaced by
    the payload values (empty when omitted), as before.

    Responses:
      - 400 when the payload is missing, is not a JSON object, or has
        ``tags``/``body`` of an unusable type;
      - 404 when ``id`` is given but matches no stored note;
      - otherwise ``{"status": "success", "id": <note id>}``.
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No JSON provided"}), 400
    if not isinstance(data, dict):
        # e.g. a JSON array: it has no .get() and would otherwise cause a 500
        return jsonify({"error": "JSON payload must be an object"}), 400

    # Validate the payload before touching the disk.
    raw_tags = data.get("tags") or ""
    if isinstance(raw_tags, str):
        raw_tags = raw_tags.split(",")
    elif not isinstance(raw_tags, list):
        return jsonify({"error": "tags must be a comma-separated string or a list"}), 400
    tags = [text for text in (str(tag).strip() for tag in raw_tags) if text]

    body = data.get("body")
    if body is None:
        body = ""
    if not isinstance(body, str):
        return jsonify({"error": "body must be a string"}), 400

    metadata = {}
    note_id = data.get("id")
    if note_id:
        # Update existing note: find the file by matching note_id
        existing = next(
            (note for note in get_all_notes() if note.get("id") == note_id),
            None,
        )
        if existing is None or not existing.get("file_path"):
            return jsonify({"error": "Note not found"}), 404
        file_path = existing["file_path"]
        # Keep front-matter keys we do not manage; "body" and "file_path" are
        # added by the loader and are not part of the stored front matter.
        metadata = {
            key: value
            for key, value in existing.items()
            if key not in ("body", "file_path")
        }
    else:
        # Create a new note
        note_id = str(uuid.uuid4())
        file_path = os.path.join(KB_DIR, f"{note_id}.md")

    metadata["id"] = note_id
    metadata["tags"] = tags
    metadata["summary"] = data.get("summary", "")

    frontmatter = "---\n" + yaml.dump(metadata, default_flow_style=False) + "---\n\n"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + body)

    return jsonify({"status": "success", "id": note_id})


@app.route('/confirm', methods=['POST'])
def confirm():
    """
    Receives a JSON payload with a file_id and reviewed segments.
    For each segment longer than WORD_THRESHOLD words, automatically generate
    a summary and tags.  Each segment is then saved as a Markdown note with
    YAML front matter (``id``, ``source_file`` and, for long segments,
    ``summary`` and ``tags``).

    ``segments`` must be a list of objects; each object's ``text`` (default
    empty) must be a string.  The whole list is validated before anything is
    written, so a malformed segment cannot leave a half-saved batch behind.

    Responses:
      - 400 when the payload is missing or not a JSON object, when
        ``file_id``/``segments`` are missing, or when ``segments`` is
        malformed;
      - otherwise ``{"status": "success", "note_ids": [...]}`` with one id
        per segment, in order.
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No JSON provided"}), 400
    if not isinstance(data, dict):
        # e.g. a JSON array: it has no .get() and would otherwise cause a 500
        return jsonify({"error": "JSON payload must be an object"}), 400

    file_id = data.get("file_id")
    segments = data.get("segments")  # List of segments with potential manual edits
    if not file_id or segments is None:
        return jsonify({"error": "file_id and segments are required"}), 400
    if not isinstance(segments, list):
        return jsonify({"error": "segments must be a list"}), 400

    # First pass: validate every segment before any file is written.
    texts = []
    for segment in segments:
        seg_text = segment.get("text", "") if isinstance(segment, dict) else None
        if not isinstance(seg_text, str):
            return jsonify(
                {"error": "each segment must be an object with a string 'text'"}
            ), 400
        texts.append(seg_text)

    # Second pass: write one note per segment.
    note_ids = []
    for seg_text in texts:
        # Generate a unique note ID and add source reference
        note_id = str(uuid.uuid4())
        metadata = {"id": note_id, "source_file": file_id}

        # If segment is long, generate summary and tags via LLM API
        if len(seg_text.split()) > WORD_THRESHOLD:
            summary, auto_tags = llm_summarize_and_tag(seg_text)
            metadata["summary"] = summary
            metadata["tags"] = auto_tags

        # Build Markdown content with YAML front matter
        md_content = (
            "---\n"
            + yaml.dump(metadata, default_flow_style=False)
            + "---\n\n"
            + seg_text
        )

        # Save the note
        md_filename = os.path.join(KB_DIR, f"{note_id}.md")
        with open(md_filename, "w", encoding="utf-8") as f:
            f.write(md_content)
        note_ids.append(note_id)

    return jsonify({"status": "success", "note_ids": note_ids})


# Start Flask's built-in server only when this file is executed as a script.
# NOTE(review): debug=True enables Flask's interactive debugger and reloader;
# that is meant for local development, so confirm this entry point is never
# used to serve real traffic.
if __name__ == "__main__":
    app.run(debug=True)