import os, re, uuid, yaml, glob, docx, PyPDF2, json, time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

INDEX_FILE = "kb_index.json"
WORD_THRESHOLD = 500  # Only generate summary/tags if text > 500 words
KB_DIR = "kb"         # Directory where your Markdown notes are stored
UPLOAD_DIR = "uploads"

os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(KB_DIR, exist_ok=True)


def load_index():
    if os.path.exists(INDEX_FILE):
        with open(INDEX_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {}


def save_index(index):
    with open(INDEX_FILE, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)


# --- Helper Functions for File Extraction ---

def extract_text_from_md(file_path):
    """
    Extract YAML front matter (if any) and body text from a Markdown file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    metadata = {}
    body = content
    if content.startswith("---"):
        # Look for the second '---' delimiter
        match = re.search(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
        if match:
            frontmatter_str = match.group(1)
            try:
                metadata = yaml.safe_load(frontmatter_str) or {}
            except Exception as e:
                print(f"Error parsing YAML in {file_path}: {e}")
            body = content[match.end():].strip()
    return metadata, body


def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using PyPDF2.
    """
    text = ""
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
    return text


def extract_text_from_docx(file_path):
    """
    Extract text from a DOCX file using python-docx.
    """
    text = ""
    try:
        doc = docx.Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    except Exception as e:
        print(f"Error reading DOCX {file_path}: {e}")
    return text


def chunk_text(text, max_words=WORD_THRESHOLD):
    """
    Break text into chunks of up to max_words words.
    Splits purely on word count, so sentence boundaries are not preserved.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = " ".join(words[i:i + max_words])
        chunks.append(chunk)
    return chunks


# --- LLM Integration ---

def llm_summarize_and_tag(text):
    """
    Stub function to call your LLM API for summarization and tag generation.
    Replace this with an actual API call.
    Returns a tuple: (summary, [tags])
    """
    # For demonstration, we take the first 100 characters as a 'summary'
    # and return some dummy tags.
    summary = f"Summary: {text[:100]}..."
    tags = ["auto-tag1", "auto-tag2"]
    return summary, tags


# --- Processing Function ---

def process_file(file_path, index):
    """Extract text from a file, enrich it if needed, and add it to the index."""
    ext = os.path.splitext(file_path)[1].lower()

    # Process Markdown notes
    if ext == ".md":
        metadata, body = extract_text_from_md(file_path)
        # If text is long and summary is missing, generate summary and tags.
        if len(body.split()) > WORD_THRESHOLD and "summary" not in metadata:
            summary, auto_tags = llm_summarize_and_tag(body)
            metadata["summary"] = summary
            # Combine with any existing tags.
            if "tags" in metadata:
                metadata["tags"] = list(set(metadata["tags"] + auto_tags))
            else:
                metadata["tags"] = auto_tags
        # Ensure an ID exists (use filename as fallback).
if "id" not in metadata: metadata["id"] = os.path.basename(file_path) metadata["body"] = body index[metadata["id"]] = metadata print(f"Indexed Markdown note: {metadata['id']}") # Process PDFs elif ext == ".pdf": text = extract_text_from_pdf(file_path) chunks = chunk_text(text, max_words=WORD_THRESHOLD) for i, chunk in enumerate(chunks): note_id = f"{os.path.basename(file_path)}_chunk_{i}" note_data = { "id": note_id, "source": file_path, "chunk_index": i, "body": chunk } if len(chunk.split()) > WORD_THRESHOLD: summary, auto_tags = llm_summarize_and_tag(chunk) note_data["summary"] = summary note_data["tags"] = auto_tags index[note_id] = note_data print(f"Indexed PDF chunk: {note_id}") # Process Word documents (.doc, .docx) elif ext in [".doc", ".docx"]: text = extract_text_from_docx(file_path) chunks = chunk_text(text, max_words=WORD_THRESHOLD) for i, chunk in enumerate(chunks): note_id = f"{os.path.basename(file_path)}_chunk_{i}" note_data = { "id": note_id, "source": file_path, "chunk_index": i, "body": chunk } if len(chunk.split()) > WORD_THRESHOLD: summary, auto_tags = llm_summarize_and_tag(chunk) note_data["summary"] = summary note_data["tags"] = auto_tags index[note_id] = note_data print(f"Indexed DOCX chunk: {note_id}") else: print(f"Unsupported file type: {file_path}") # --- Watchdog Event Handler --- class KBEventHandler(FileSystemEventHandler): def __init__(self, index): self.index = index def process(self, file_path): # Only process supported file types. if file_path.lower().endswith((".md", ".pdf", ".doc", ".docx")): process_file(file_path, self.index) save_index(self.index) def on_created(self, event): if not event.is_directory: self.process(event.src_path) def on_modified(self, event): if not event.is_directory: self.process(event.src_path) def main(): index = load_index() event_handler = KBEventHandler(index) observer = Observer() observer.schedule(event_handler, path=KB_DIR, recursive=False) observer.start() print("Monitoring KB directory for changes...") try: while True: time.sleep(1) except KeyboardInterrupt: observer.stop() observer.join() if __name__ == "__main__": main()