import glob
import json
import os
import re
import uuid

import docx
import PyPDF2
import yaml
from flask import Flask, request, jsonify

# ------------------------------
# Configuration
# ------------------------------
INDEX_FILE = "kb_index.json"  # Reserved for a search index (not used yet in this module).
WORD_THRESHOLD = 500          # Only generate summary/tags if a segment exceeds this many words.
KB_DIR = "kb"                 # Directory where the Markdown notes are stored.
UPLOAD_DIR = "uploads"        # Directory for uploaded source files.

os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(KB_DIR, exist_ok=True)


# ------------------------------
# LLM API Stub for Summarization & Tagging
# ------------------------------
def llm_summarize_and_tag(text):
    """
    Stub for calling an LLM API. Replace with your API call.

    Args:
        text: The segment text to summarize and tag.

    Returns:
        A tuple ``(summary, tags)`` where ``summary`` is a string and
        ``tags`` is a list of tag strings.
    """
    summary = f"Auto summary: {text[:100]}..."  # Simple stub summary
    tags = ["auto", "generated"]
    return summary, tags


# ------------------------------
# File Extraction Helpers
# ------------------------------
def extract_text_from_pdf(file_path):
    """Extract plain text from a PDF file; best-effort, returns "" on failure."""
    text = ""
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                # extract_text() may return None (e.g. image-only pages); skip those.
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        # Best-effort: log and return whatever text was accumulated so far.
        print(f"Error reading PDF {file_path}: {e}")
    return text


def extract_text_from_docx(file_path):
    """Extract plain text from a .docx file; best-effort, returns "" on failure."""
    text = ""
    try:
        doc = docx.Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
    return text


# ------------------------------
# Segmentation Helper
# ------------------------------
def segment_text(text):
    """
    Split text into segments based on double newlines (blank lines).

    Returns a list of non-empty, stripped segments.
    """
    return [seg.strip() for seg in re.split(r"\n\s*\n", text) if seg.strip()]


def load_note_from_file(file_path):
    """
    Load a note from a Markdown file by parsing its YAML front matter.

    Returns:
        A dict of front-matter metadata augmented with two keys:
        ``body`` (the Markdown content after the front matter) and
        ``file_path`` (kept internally for updating an existing note).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    metadata = {}
    body = content
    if content.startswith("---"):
        # Look for YAML front matter between the first two '---' lines.
        match = re.search(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
        if match:
            try:
                # safe_load: never execute arbitrary YAML tags from note files.
                metadata = yaml.safe_load(match.group(1)) or {}
            except Exception as e:
                print(f"Error parsing YAML in {file_path}: {e}")
            if not isinstance(metadata, dict):
                # Front matter that parses to a list/scalar is unusable as metadata.
                metadata = {}
            body = content[match.end():].strip()

    metadata["body"] = body
    metadata["file_path"] = file_path  # Used for updating an existing note
    return metadata


def get_all_notes():
    """Return a list of all notes in the KB."""
    return [load_note_from_file(p) for p in glob.glob(os.path.join(KB_DIR, "*.md"))]


def get_notes_with_tag(tag):
    """Return only the notes that contain a specific tag."""
    # `or []` guards against front matter where `tags:` is present but empty
    # (YAML parses that to None, which is not iterable for `in`).
    return [note for note in get_all_notes() if tag in (note.get("tags") or [])]


def get_related_tags(tag):
    """
    For a given tag, return a dict of other tags that appear in the same
    notes along with their frequency.
    """
    related = {}
    for note in get_notes_with_tag(tag):
        for t in note.get("tags", []):
            if t != tag:
                related[t] = related.get(t, 0) + 1
    return related


app = Flask(__name__)


# ------------------------------
# Flask Endpoints
# ------------------------------
@app.route("/api/notes", methods=["GET"])
def api_notes():
    """
    If a query parameter 'tag' is provided, return only notes with that tag.
    Otherwise, return all notes. The internal file_path is stripped from
    the response.
    """
    tag = request.args.get("tag")
    notes = get_notes_with_tag(tag) if tag else get_all_notes()
    # Remove file_path from the returned data — never expose server paths.
    for note in notes:
        note.pop("file_path", None)
    return jsonify(notes)


@app.route("/api/related_tags", methods=["GET"])
def api_related_tags():
    """Return tags that appear with a given tag (with frequency counts)."""
    tag = request.args.get("tag")
    if not tag:
        return jsonify({"error": "tag parameter is required"}), 400
    return jsonify(get_related_tags(tag))


# BUG FIX: the route previously lacked the <note_id> URL converter, so Flask
# could never bind the view's note_id argument and the endpoint was broken.
@app.route("/api/note/<note_id>", methods=["GET"])
def get_note(note_id):
    """Return a single note by its ID, or a 404 JSON error if not found."""
    for note in get_all_notes():
        if note.get("id") == note_id:
            note_copy = note.copy()
            note_copy.pop("file_path", None)
            return jsonify(note_copy)
    return jsonify({"error": "Note not found"}), 404


# ------------------------------
# SAVE or update a note
# ------------------------------
@app.route("/api/note", methods=["POST"])
def save_note():
    """
    Save or update a note. The expected JSON payload should contain:
      - id (optional; if provided, the note is updated)
      - tags (a comma-separated string)
      - summary (optional)
      - body (the note content)
    The note is stored as a Markdown file with YAML front matter.
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No JSON provided"}), 400

    note_id = data.get("id")
    if note_id:
        # Update existing note: find the file by matching note_id.
        file_path = None
        for note in get_all_notes():
            if note.get("id") == note_id:
                file_path = note.get("file_path")
                break
        if not file_path:
            return jsonify({"error": "Note not found"}), 404
    else:
        # Create a new note: a fresh UUID serves as both id and filename.
        note_id = str(uuid.uuid4())
        file_path = os.path.join(KB_DIR, f"{note_id}.md")

    # Tags arrive as a comma-separated string; normalize to a clean list.
    tags = [tag.strip() for tag in data.get("tags", "").split(",") if tag.strip()]
    metadata = {
        "id": note_id,
        "tags": tags,
        "summary": data.get("summary", ""),
    }
    frontmatter = "---\n" + yaml.dump(metadata, default_flow_style=False) + "---\n\n"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(frontmatter + data.get("body", ""))
    return jsonify({"status": "success", "id": note_id})


@app.route('/confirm', methods=['POST'])
def confirm():
    """
    Receives a JSON payload with a file_id and reviewed segments. For each
    segment longer than WORD_THRESHOLD words, automatically generate a
    summary and tags. Each segment is then saved as a Markdown note with
    YAML front matter.
    """
    data = request.get_json()
    if not data:
        return jsonify({"error": "No JSON provided"}), 400

    file_id = data.get("file_id")
    segments = data.get("segments")  # List of segments with potential manual edits
    if not file_id or segments is None:
        return jsonify({"error": "file_id and segments are required"}), 400

    note_ids = []
    for segment in segments:
        seg_text = segment.get("text", "")
        metadata = {}

        # If segment is long, generate summary and tags via LLM API.
        # (Was a hard-coded 500; now uses the module-level WORD_THRESHOLD.)
        if len(seg_text.split()) > WORD_THRESHOLD:
            summary, auto_tags = llm_summarize_and_tag(seg_text)
            metadata["summary"] = summary
            metadata["tags"] = auto_tags

        # Each segment becomes its own note, referencing its source file.
        note_id = str(uuid.uuid4())
        metadata["id"] = note_id
        metadata["source_file"] = file_id

        # Build Markdown content with YAML front matter and save it.
        md_content = (
            "---\n" + yaml.dump(metadata, default_flow_style=False) + "---\n\n" + seg_text
        )
        md_filename = os.path.join(KB_DIR, f"{note_id}.md")
        with open(md_filename, "w", encoding="utf-8") as f:
            f.write(md_content)
        note_ids.append(note_id)

    return jsonify({"status": "success", "note_ids": note_ids})


if __name__ == "__main__":
    # debug=True is for local development only; never enable in production.
    app.run(debug=True)