# notesapp/indexer.py

import json
import os
import re
import time

import docx
import PyPDF2
import yaml
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

INDEX_FILE = "kb_index.json"
WORD_THRESHOLD = 500  # Chunk size in words; also the minimum size before generating a summary/tags
KB_DIR = "kb"  # Directory where your Markdown notes are stored
UPLOAD_DIR = "uploads"

os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(KB_DIR, exist_ok=True)


def load_index():
    """Load the existing index from disk, or return an empty dict."""
    if os.path.exists(INDEX_FILE):
        with open(INDEX_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_index(index):
    """Write the index back to disk as pretty-printed JSON."""
    with open(INDEX_FILE, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)


# --- Helper Functions for File Extraction ---
def extract_text_from_md(file_path):
    """
    Extract YAML front matter (if any) and body text from a Markdown file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    metadata = {}
    body = content
    if content.startswith("---"):
        # Look for the second '---' delimiter
        match = re.search(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
        if match:
            frontmatter_str = match.group(1)
            try:
                metadata = yaml.safe_load(frontmatter_str) or {}
            except Exception as e:
                print(f"Error parsing YAML in {file_path}: {e}")
            body = content[match.end():].strip()
    return metadata, body
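# For reference, a Markdown note with front matter that this parser accepts
# might look like the following (hypothetical example; any YAML keys are kept,
# and "id", "tags", and "summary" are the ones the indexer acts on):
#
#   ---
#   id: project-ideas
#   tags: [planning, drafts]
#   ---
#   Body text of the note goes here...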


def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using PyPDF2.
    """
    text = ""
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
    return text


def extract_text_from_docx(file_path):
    """
    Extract text from a DOCX file using python-docx.
    """
    text = ""
    try:
        doc = docx.Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
    return text


def chunk_text(text, max_words=WORD_THRESHOLD):
    """
    Break text into chunks of up to max_words words.
    Splits purely on word count, so chunk boundaries may fall mid-sentence.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = " ".join(words[i:i + max_words])
        chunks.append(chunk)
    return chunks
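# Example (hypothetical input): chunk_text("a b c d e", max_words=2)
# returns ["a b", "c d", "e"]; the final chunk simply holds the remainder.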


# --- LLM Integration ---
def llm_summarize_and_tag(text):
    """
    Stub function to call your LLM API for summarization and tag generation.
    Replace this with an actual API call.
    Returns a tuple: (summary, [tags])
    """
    # For demonstration, we take the first 100 characters as a 'summary'
    # and return some dummy tags.
    summary = f"Summary: {text[:100]}..."
    tags = ["auto-tag1", "auto-tag2"]
    return summary, tags
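# One possible way to replace the stub above, sketched under the assumption that
# the official `openai` Python package is installed and OPENAI_API_KEY is set in
# the environment. The model name and prompt wording are placeholders, not part
# of the original design:
#
#   from openai import OpenAI
#   client = OpenAI()
#
#   def llm_summarize_and_tag(text):
#       resp = client.chat.completions.create(
#           model="gpt-4o-mini",
#           messages=[{
#               "role": "user",
#               "content": "Summarize the following note in 2-3 sentences, then "
#                          "list 3-5 short topic tags on a final line prefixed "
#                          "with 'TAGS:'.\n\n" + text,
#           }],
#       )
#       raw = resp.choices[0].message.content
#       summary, _, tag_line = raw.rpartition("TAGS:")
#       tags = [t.strip() for t in tag_line.split(",") if t.strip()]
#       return summary.strip(), tags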


# --- Processing Function ---
def process_file(file_path, index):
    ext = os.path.splitext(file_path)[1].lower()
    # Process Markdown notes
    if ext == ".md":
        metadata, body = extract_text_from_md(file_path)
        # If text is long and summary is missing, generate summary and tags.
        if len(body.split()) > WORD_THRESHOLD and "summary" not in metadata:
            summary, auto_tags = llm_summarize_and_tag(body)
            metadata["summary"] = summary
            # Combine with any existing tags.
            if "tags" in metadata:
                metadata["tags"] = list(set(metadata["tags"] + auto_tags))
            else:
                metadata["tags"] = auto_tags
        # Ensure an ID exists (use filename as fallback).
        if "id" not in metadata:
            metadata["id"] = os.path.basename(file_path)
        metadata["body"] = body
        index[metadata["id"]] = metadata
        print(f"Indexed Markdown note: {metadata['id']}")
    # Process PDFs
    elif ext == ".pdf":
        text = extract_text_from_pdf(file_path)
        chunks = chunk_text(text, max_words=WORD_THRESHOLD)
        for i, chunk in enumerate(chunks):
            note_id = f"{os.path.basename(file_path)}_chunk_{i}"
            note_data = {
                "id": note_id,
                "source": file_path,
                "chunk_index": i,
                "body": chunk,
            }
            # A full-size chunk (WORD_THRESHOLD words) gets a summary and tags.
            if len(chunk.split()) >= WORD_THRESHOLD:
                summary, auto_tags = llm_summarize_and_tag(chunk)
                note_data["summary"] = summary
                note_data["tags"] = auto_tags
            index[note_id] = note_data
            print(f"Indexed PDF chunk: {note_id}")
    # Process Word documents (.doc, .docx)
    # Note: python-docx can only open .docx files; a legacy .doc file will fail
    # inside extract_text_from_docx and produce no chunks.
    elif ext in [".doc", ".docx"]:
        text = extract_text_from_docx(file_path)
        chunks = chunk_text(text, max_words=WORD_THRESHOLD)
        for i, chunk in enumerate(chunks):
            note_id = f"{os.path.basename(file_path)}_chunk_{i}"
            note_data = {
                "id": note_id,
                "source": file_path,
                "chunk_index": i,
                "body": chunk,
            }
            if len(chunk.split()) >= WORD_THRESHOLD:
                summary, auto_tags = llm_summarize_and_tag(chunk)
                note_data["summary"] = summary
                note_data["tags"] = auto_tags
            index[note_id] = note_data
            print(f"Indexed DOCX chunk: {note_id}")
    else:
        print(f"Unsupported file type: {file_path}")


# --- Watchdog Event Handler ---
class KBEventHandler(FileSystemEventHandler):
    def __init__(self, index):
        self.index = index

    def process(self, file_path):
        # Only process supported file types.
        if file_path.lower().endswith((".md", ".pdf", ".doc", ".docx")):
            process_file(file_path, self.index)
            save_index(self.index)

    def on_created(self, event):
        if not event.is_directory:
            self.process(event.src_path)

    def on_modified(self, event):
        if not event.is_directory:
            self.process(event.src_path)


def main():
    index = load_index()
    event_handler = KBEventHandler(index)
    observer = Observer()
    observer.schedule(event_handler, path=KB_DIR, recursive=False)
    observer.start()
    print("Monitoring KB directory for changes...")
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


if __name__ == "__main__":
    main()
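# To try the watcher (assuming PyYAML, python-docx, PyPDF2, and watchdog are
# installed): run `python notesapp/indexer.py`, then drop or edit a .md, .pdf,
# or .docx file inside the kb/ directory. Each create/modify event indexes the
# file and rewrites kb_index.json.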