#!/usr/bin/env python3
"""
Script to generate MDC files from the PocketFlow docs folder, creating one MDC file per MD file.

Usage:
    python update_pocketflow_mdc.py [--docs-dir PATH] [--rules-dir PATH]
"""

import os
import re
import shutil
from pathlib import Path
import sys
import html.parser

class HTMLTagStripper(html.parser.HTMLParser):
    """HTML Parser subclass to strip HTML tags from content"""
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = []
    
    def handle_data(self, data):
        self.text.append(data)
    
    def get_text(self):
        return ''.join(self.text)

def strip_html_tags(html_content):
    """Remove HTML tags from content"""
    stripper = HTMLTagStripper()
    stripper.feed(html_content)
    return stripper.get_text()

def extract_frontmatter(file_path):
    """Extract title, parent, and nav_order from markdown frontmatter"""
    frontmatter = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            
            # Extract frontmatter between --- markers
            fm_match = re.search(r'^---\s*(.+?)\s*---', content, re.DOTALL)
            if fm_match:
                frontmatter_text = fm_match.group(1)
                
                # Extract fields
                title_match = re.search(r'title:\s*"?([^"\n]+)"?', frontmatter_text)
                parent_match = re.search(r'parent:\s*"?([^"\n]+)"?', frontmatter_text)
                nav_order_match = re.search(r'nav_order:\s*(\d+)', frontmatter_text)
                
                if title_match:
                    frontmatter['title'] = title_match.group(1)
                if parent_match:
                    frontmatter['parent'] = parent_match.group(1)
                if nav_order_match:
                    frontmatter['nav_order'] = int(nav_order_match.group(1))
    except Exception as e:
        print(f"Error reading frontmatter from {file_path}: {e}")
    
    return frontmatter

def extract_first_heading(file_path):
    """Extract the first heading from markdown content"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            
            # Remove frontmatter
            content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
            
            # Find first heading
            heading_match = re.search(r'#\s+(.+)', content)
            if heading_match:
                return heading_match.group(1).strip()
    except Exception as e:
        print(f"Error extracting heading from {file_path}: {e}")
    
    # Fallback to filename if no heading found
    return Path(file_path).stem.replace('_', ' ').title()

def get_mdc_description(md_file, frontmatter, heading):
    """Generate a description for the MDC file based on file metadata"""
    section = ""
    subsection = ""
    
    # Determine section from path
    path_parts = Path(md_file).parts
    if 'core_abstraction' in path_parts:
        section = "Core Abstraction"
    elif 'design_pattern' in path_parts:
        section = "Design Pattern"
    elif 'utility_function' in path_parts:
        section = "Utility Function"
    
    # Use frontmatter title or heading as subsection
    if 'title' in frontmatter:
        subsection = frontmatter['title']
    else:
        subsection = heading
    
    # For index.md at root level, use a different format
    if Path(md_file).name == "index.md" and section == "":
        return "Guidelines for using PocketFlow, a minimalist LLM framework"
    
    # For other files, create a more specific description
    if section:
        return f"Guidelines for using PocketFlow, {section}, {subsection}"
    else:
        return f"Guidelines for using PocketFlow, {subsection}"

def process_markdown_content(content, remove_local_refs=False):
    """Process markdown content to make it suitable for MDC file"""
    # Remove frontmatter
    content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
    
    # Replace HTML div tags and their content
    content = re.sub(r'<div.*?>.*?</div>', '', content, flags=re.DOTALL)
    
    if remove_local_refs:
        # Replace markdown links to local documentation with just the text in brackets
        # This prevents automatically including all docs when the file is loaded
        # Keep the brackets around the text for better discoverability
        content = re.sub(r'\[([^\]]+)\]\(\./[^)]+\)', r'[\1]', content)
    else:
        # Adjust relative links to maintain references within the docs structure
        content = re.sub(r'\]\(\./([^)]+)\)', r'](mdc:./\1)', content)
        
        # Ensure links to md files work correctly
        content = re.sub(r'\]\(mdc:\./(.+?)\.md\)', r'](mdc:./\1.md)', content)
        content = re.sub(r'\]\(mdc:\./(.+?)\.html\)', r'](mdc:./\1.md)', content)
    
    # Strip remaining HTML tags
    content = strip_html_tags(content)
    
    return content

def generate_mdc_header(md_file, description, always_apply=False):
    """Generate MDC file header with appropriate frontmatter"""
    # Determine if we should include globs
    # For index.md and guide.md, we include **/*.py to provide high-level context for Python files
    # For other files, leave it empty to be less intrusive
    globs = "**/*.py" if always_apply else ""
    
    return f"""---
description: {description}
globs: {globs}
alwaysApply: {"true" if always_apply else "false"}
---
"""

def has_substantive_content(content):
    """Check if the processed content has substantive content beyond the frontmatter"""
    # Remove frontmatter
    content_without_frontmatter = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
    
    # Remove whitespace and common HTML/markdown formatting
    cleaned_content = re.sub(r'\s+', '', content_without_frontmatter)
    cleaned_content = re.sub(r'{:.*?}', '', cleaned_content)
    
    # If there's almost nothing left after cleaning, consider it empty
    return len(cleaned_content) > 20  # Arbitrary threshold, adjust as needed

def convert_md_to_mdc(md_file, output_dir, docs_dir, special_treatment=False):
    """Convert a markdown file to MDC format and save to the output directory"""
    try:
        print(f"Processing: {md_file}")
        
        # Skip empty index.md files in subfolders
        file_name = Path(md_file).name
        parent_dir = Path(md_file).parent.name
        
        # Check if this is an index.md in a subfolder (not the main index.md)
        if (file_name == "index.md" and parent_dir != "docs" and 
            parent_dir in ["core_abstraction", "design_pattern", "utility_function"]):
            
            # Read the content
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()
                
            # Skip if it doesn't have substantive content
            if not has_substantive_content(content):
                print(f"Skipping empty subfolder index: {md_file}")
                return True
        
        # Extract metadata from file
        frontmatter = extract_frontmatter(md_file)
        heading = extract_first_heading(md_file)
        description = get_mdc_description(md_file, frontmatter, heading)
        
        # Read the content
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Check if this file should have special treatment (index.md or guide.md)
        is_special = special_treatment or Path(md_file).name == "guide.md"
        
        # Process the content
        processed_content = process_markdown_content(content, remove_local_refs=is_special)
        
        # Generate the MDC header
        mdc_header = generate_mdc_header(md_file, description, always_apply=is_special)
        
        # Combine header and processed content
        mdc_content = mdc_header + processed_content
        
        # Perform a final check to ensure the processed content is substantive
        if not has_substantive_content(processed_content):
            print(f"Skipping file with no substantive content after processing: {md_file}")
            return True
        
        # Get the path relative to the docs directory
        rel_path = os.path.relpath(md_file, start=Path(docs_dir))
        
        # Extract just the filename and directory structure without the 'docs/' prefix
        path_parts = Path(rel_path).parts
        if len(path_parts) > 1 and path_parts[0] == 'docs':
            # Remove the 'docs/' prefix from the path
            rel_path = os.path.join(*path_parts[1:])
        
        # Create the output path
        output_path = Path(output_dir) / rel_path
        
        # Create output directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Change extension from .md to .mdc
        output_path = output_path.with_suffix('.mdc')
        
        # Write the MDC file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(mdc_content)
        
        print(f"Created MDC file: {output_path}")
        return True
    
    except Exception as e:
        print(f"Error converting {md_file} to MDC: {e}")
        return False

def generate_mdc_files(docs_dir, rules_dir):
    """Generate MDC files from all markdown files in the docs directory"""
    docs_path = Path(docs_dir)
    rules_path = Path(rules_dir)
    
    # Make sure the docs directory exists
    if not docs_path.exists() or not docs_path.is_dir():
        raise ValueError(f"Directory not found: {docs_dir}")
    
    print(f"Generating MDC files from docs in: {docs_dir}")
    print(f"Output will be written to: {rules_dir}")
    
    # Create the rules directory if it doesn't exist
    rules_path.mkdir(parents=True, exist_ok=True)
    
    # Process the main index.md file first
    index_file = docs_path / "index.md"
    if index_file.exists():
        convert_md_to_mdc(index_file, rules_path, docs_dir, special_treatment=True)
    
    # Process guide.md file with special treatment (if it exists)
    guide_file = docs_path / "guide.md"
    if guide_file.exists():
        convert_md_to_mdc(guide_file, rules_path, docs_dir, special_treatment=True)
    
    # Process all other markdown files
    success_count = 0
    failure_count = 0
    
    # Find all markdown files
    md_files = list(docs_path.glob("**/*.md"))
    
    # Skip the main index.md and guide.md files as we've already processed them
    md_files = [f for f in md_files if f != index_file and f != guide_file]
    
    # Process each markdown file
    for md_file in md_files:
        if convert_md_to_mdc(md_file, rules_path, docs_dir):
            success_count += 1
        else:
            failure_count += 1
    
    print(f"\nProcessed {len(md_files) + 2} markdown files:")
    print(f"  - Successfully converted: {success_count + 2}")
    print(f"  - Failed conversions: {failure_count}")
    
    return success_count > 0 and failure_count == 0

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Generate MDC files from PocketFlow docs")
    
    # Get script directory
    script_dir = Path(__file__).parent.absolute()
    
    # Default to PocketFlow/docs directory relative to script location
    default_docs_dir = (script_dir.parent / "docs").as_posix()
    
    # Default rules directory - changed to .cursor/rules
    default_rules_dir = (script_dir.parent / ".cursor" / "rules").as_posix()
    
    parser.add_argument("--docs-dir", 
                        default=default_docs_dir, 
                        help="Path to PocketFlow docs directory")
    parser.add_argument("--rules-dir", 
                        default=default_rules_dir, 
                        help="Output directory for MDC files")
    
    args = parser.parse_args()
    
    try:
        success = generate_mdc_files(args.docs_dir, args.rules_dir)
        sys.exit(0 if success else 1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)