# pocketflow/utils/update_pocketflow_mdc.py

#!/usr/bin/env python3
"""
Script to generate MDC files from the PocketFlow docs folder, creating one MDC file per MD file.

Usage:
    python update_pocketflow_mdc.py [--docs-dir PATH] [--rules-dir PATH]
"""

import os
import re
import shutil
from pathlib import Path
import sys
import html.parser

class HTMLTagStripper(html.parser.HTMLParser):
    """HTML parser that discards markup and keeps only the text between tags.

    ``convert_charrefs`` is enabled, so character references such as
    ``&amp;`` reach ``handle_data`` already converted to their characters.
    """

    def __init__(self):
        # HTMLParser.__init__ calls reset() itself, so no explicit reset is
        # needed here. The legacy ``strict`` attribute is no longer set: it
        # was removed from the standard library parser and had no effect.
        super().__init__(convert_charrefs=True)
        self.text = []

    def handle_data(self, data):
        """Collect one chunk of text found outside of any tag."""
        self.text.append(data)

    def get_text(self):
        """Return every collected text chunk joined into one string."""
        return ''.join(self.text)

def strip_html_tags(html_content):
    """Return ``html_content`` with HTML tags removed.

    Args:
        html_content: Text that may contain HTML markup.

    Returns:
        The text nodes of the input, concatenated in document order.
    """
    stripper = HTMLTagStripper()
    stripper.feed(html_content)
    # feed() alone may hold back trailing text (for example when the input
    # ends with an unterminated "&..." sequence) because the parser waits for
    # more data. close() tells it the input is complete so nothing is lost.
    stripper.close()
    return stripper.get_text()

def _frontmatter_field(frontmatter_text, key):
    """Return the unquoted value of the ``key: value`` line, or None if absent/empty."""
    for line in frontmatter_text.splitlines():
        name, sep, value = line.partition(':')
        # Compare the whole key so that e.g. "subtitle" is never mistaken
        # for "title".
        if not sep or name.strip() != key:
            continue
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        return value or None
    return None

def extract_frontmatter(file_path):
    """Extract title, parent, and nav_order from markdown frontmatter.

    The frontmatter is the text between the leading ``---`` marker at the
    very start of the file and the next ``---``.

    Args:
        file_path: Path of the markdown file to read (UTF-8).

    Returns:
        A dict holding any of the keys ``'title'`` (str), ``'parent'`` (str)
        and ``'nav_order'`` (int) that were found. The dict is empty when the
        file has no frontmatter or cannot be read; read errors are printed,
        not raised.
    """
    frontmatter = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non-UTF-8 files.
        print(f"Error reading frontmatter from {file_path}: {e}")
        return frontmatter

    # Extract frontmatter between --- markers
    fm_match = re.search(r'^---\s*(.+?)\s*---', content, re.DOTALL)
    if not fm_match:
        return frontmatter
    frontmatter_text = fm_match.group(1)

    for key in ('title', 'parent'):
        value = _frontmatter_field(frontmatter_text, key)
        if value is not None:
            frontmatter[key] = value

    # Only the leading run of digits is used, so "3" and "3 # note" both give 3.
    nav_order_match = re.match(r'\d+', _frontmatter_field(frontmatter_text, 'nav_order') or '')
    if nav_order_match:
        frontmatter['nav_order'] = int(nav_order_match.group())

    return frontmatter

def extract_first_heading(file_path):
    """Extract the first markdown heading from a file.

    Only ATX-style heading lines (one to six ``#`` followed by a space or
    tab) that sit outside fenced code blocks are considered, so a ``#`` in
    running text or a ``# comment`` inside a code sample is not mistaken for
    a heading.

    Args:
        file_path: Path of the markdown file to read (UTF-8).

    Returns:
        The heading text without the leading ``#`` markers. When the file
        has no heading or cannot be read, a title derived from the file name
        instead (``my_file.md`` -> ``My File``). Read errors are printed,
        not raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non-UTF-8 files.
        print(f"Error extracting heading from {file_path}: {e}")
    else:
        # Remove frontmatter
        content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)

        in_fence = False
        for line in content.splitlines():
            # Track ``` / ~~~ fences so code comments are skipped.
            if line.lstrip().startswith(('```', '~~~')):
                in_fence = not in_fence
                continue
            if in_fence:
                continue
            heading_match = re.match(r' {0,3}#{1,6}[ \t]+(.+)', line)
            if heading_match:
                heading = heading_match.group(1).strip()
                if heading:
                    return heading

    # Fallback to filename if no heading found
    return Path(file_path).stem.replace('_', ' ').title()

def get_mdc_description(md_file, frontmatter, heading):
    """Build the one-line ``description`` value for an MDC file.

    Args:
        md_file: Path of the source markdown file; its folder names select
            the section label and its file name identifies the root index.
        frontmatter: Dict of parsed frontmatter; ``'title'`` is used if present.
        heading: Fallback subsection text when the frontmatter has no title.

    Returns:
        The description string for the MDC header.
    """
    doc_path = Path(md_file)

    # Folder name -> human readable section label; the first folder found
    # in the path wins, checked in this order.
    known_sections = (
        ('core_abstraction', "Core Abstraction"),
        ('design_pattern', "Design Pattern"),
        ('utility_function', "Utility Function"),
    )
    section = next(
        (label for folder, label in known_sections if folder in doc_path.parts),
        "",
    )

    # The frontmatter title takes precedence over the first heading.
    subsection = frontmatter['title'] if 'title' in frontmatter else heading

    # An index.md outside of every known section gets the generic
    # framework-level description.
    if not section and doc_path.name == "index.md":
        return "Guidelines for using PocketFlow, a minimalist LLM framework"

    if not section:
        return f"Guidelines for using PocketFlow, {subsection}"
    return f"Guidelines for using PocketFlow, {section}, {subsection}"

def process_markdown_content(content, remove_local_refs=False):
    """Process markdown content to make it suitable for an MDC file.

    The steps run in this order: drop the leading frontmatter, drop
    ``<div>...</div>`` blocks together with their content, rewrite links
    that start with ``./``, and finally strip any remaining HTML tags.

    Args:
        content: Full text of the markdown file.
        remove_local_refs: When True, ``[text](./target)`` links are reduced
            to ``[text]``. When False, the links are kept, prefixed with
            ``mdc:``, and ``.html`` targets are renamed to ``.md``.

    Returns:
        The processed markdown text.
    """
    # Remove frontmatter
    content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)

    # Remove HTML div blocks together with everything inside them
    content = re.sub(r'<div.*?>.*?</div>', '', content, flags=re.DOTALL)

    if remove_local_refs:
        # Replace markdown links to local documentation with just the text in brackets
        # This prevents automatically including all docs when the file is loaded
        # Keep the brackets around the text for better discoverability
        content = re.sub(r'\[([^\]]+)\]\(\./[^)]+\)', r'[\1]', content)
    else:
        # Adjust relative links to maintain references within the docs structure
        content = re.sub(r'\]\(\./([^)]+)\)', r'](mdc:./\1)', content)

        # Links to .md files are already correct after the prefixing above
        # (the former .md -> .md substitution changed nothing and was
        # dropped); only .html targets need to point at the .md source.
        content = re.sub(r'\]\(mdc:\./(.+?)\.html\)', r'](mdc:./\1.md)', content)

    # Strip remaining HTML tags
    return strip_html_tags(content)

def generate_mdc_header(md_file, description, always_apply=False):
    """Build the frontmatter header placed at the top of an MDC file.

    Args:
        md_file: Source markdown path (not used when building the header).
        description: Text for the ``description`` field.
        always_apply: When truthy the header targets ``**/*.py`` and sets
            ``alwaysApply: true``; otherwise ``globs`` is left empty and
            ``alwaysApply`` is ``false``, which is less intrusive.

    Returns:
        The header text, delimited by ``---`` lines and ending in a newline.
    """
    if always_apply:
        globs, apply_flag = "**/*.py", "true"
    else:
        globs, apply_flag = "", "false"

    header_lines = (
        "---",
        f"description: {description}",
        f"globs: {globs}",
        f"alwaysApply: {apply_flag}",
        "---",
    )
    return "\n".join(header_lines) + "\n"

def has_substantive_content(content):
    """Tell whether ``content`` holds more than boilerplate.

    Leading frontmatter, all whitespace and ``{: ... }`` attribute markers
    are ignored; what is left must be longer than 20 characters.

    Args:
        content: Markdown text, with or without frontmatter.

    Returns:
        True when more than 20 characters remain after cleaning.
    """
    minimum_length = 20  # Arbitrary threshold, adjust as needed

    body = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
    compact = re.sub(r'\s+', '', body)
    remainder = re.sub(r'{:.*?}', '', compact)

    return len(remainder) > minimum_length

def convert_md_to_mdc(md_file, output_dir, docs_dir, special_treatment=False):
    """Convert one markdown file to MDC format and write it below ``output_dir``.

    Args:
        md_file: Path of the markdown file to convert.
        output_dir: Root directory that receives the ``.mdc`` file.
        docs_dir: Docs root; the output keeps the file's path relative to it.
        special_treatment: When truthy the file is written with
            ``alwaysApply: true`` and its local links are flattened. A file
            named ``guide.md`` is always treated this way.

    Returns:
        True when the file was written or deliberately skipped for lack of
        content, False when an error occurred (the error is printed).
    """
    section_dirs = ("core_abstraction", "design_pattern", "utility_function")

    try:
        print(f"Processing: {md_file}")
        source = Path(md_file)

        # A section folder's own index.md is skipped when it is empty
        # (this never applies to the main index.md).
        if source.name == "index.md" and source.parent.name in section_dirs:
            raw_text = source.read_text(encoding='utf-8')
            if not has_substantive_content(raw_text):
                print(f"Skipping empty subfolder index: {md_file}")
                return True

        # Metadata that feeds the header description
        frontmatter = extract_frontmatter(md_file)
        heading = extract_first_heading(md_file)
        description = get_mdc_description(md_file, frontmatter, heading)

        content = source.read_text(encoding='utf-8')

        # index.md is flagged by the caller, guide.md is recognised by name
        is_special = special_treatment or source.name == "guide.md"

        processed_content = process_markdown_content(content, remove_local_refs=is_special)
        mdc_header = generate_mdc_header(md_file, description, always_apply=is_special)

        # Nothing meaningful survived processing: do not write a file
        if not has_substantive_content(processed_content):
            print(f"Skipping file with no substantive content after processing: {md_file}")
            return True

        # Mirror the docs layout below the output directory, without a
        # leading 'docs/' component
        rel_path = os.path.relpath(md_file, start=Path(docs_dir))
        rel_parts = Path(rel_path).parts
        if len(rel_parts) > 1 and rel_parts[0] == 'docs':
            rel_path = os.path.join(*rel_parts[1:])

        destination = Path(output_dir) / rel_path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination = destination.with_suffix('.mdc')
        destination.write_text(mdc_header + processed_content, encoding='utf-8')

        print(f"Created MDC file: {destination}")
        return True

    except Exception as e:
        print(f"Error converting {md_file} to MDC: {e}")
        return False

def generate_mdc_files(docs_dir, rules_dir):
    """Generate MDC files from all markdown files in the docs directory.

    The top-level ``index.md`` and ``guide.md`` (when present) are converted
    first with special treatment; every other ``*.md`` file below
    ``docs_dir`` follows in sorted order. All conversions, including those
    two, are counted in the printed summary and in the return value.

    Args:
        docs_dir: Directory that contains the markdown docs.
        rules_dir: Directory that receives the MDC files (created if missing).

    Returns:
        True when at least one file was processed and none failed.

    Raises:
        ValueError: If ``docs_dir`` does not exist or is not a directory.
    """
    docs_path = Path(docs_dir)
    rules_path = Path(rules_dir)

    # Make sure the docs directory exists
    if not docs_path.is_dir():
        raise ValueError(f"Directory not found: {docs_dir}")

    print(f"Generating MDC files from docs in: {docs_dir}")
    print(f"Output will be written to: {rules_dir}")

    # Create the rules directory if it doesn't exist
    rules_path.mkdir(parents=True, exist_ok=True)

    # Only special files that really exist are processed and counted.
    special_files = [
        candidate
        for candidate in (docs_path / "index.md", docs_path / "guide.md")
        if candidate.exists()
    ]
    # Sorted so the processing order does not depend on the file system.
    other_files = sorted(
        md_file for md_file in docs_path.glob("**/*.md")
        if md_file not in special_files
    )

    success_count = 0
    failure_count = 0
    for md_file in special_files + other_files:
        converted = convert_md_to_mdc(
            md_file, rules_path, docs_dir,
            special_treatment=md_file in special_files,
        )
        if converted:
            success_count += 1
        else:
            failure_count += 1

    print(f"\nProcessed {success_count + failure_count} markdown files:")
    print(f"  - Successfully converted: {success_count}")
    print(f"  - Failed conversions: {failure_count}")

    return success_count > 0 and failure_count == 0

if __name__ == "__main__":
    import argparse

    # Both defaults live next to the directory that contains this script:
    # <parent>/docs for the input and <parent>/.cursor/rules for the output.
    base_dir = Path(__file__).parent.absolute().parent

    cli = argparse.ArgumentParser(description="Generate MDC files from PocketFlow docs")
    cli.add_argument(
        "--docs-dir",
        default=(base_dir / "docs").as_posix(),
        help="Path to PocketFlow docs directory",
    )
    cli.add_argument(
        "--rules-dir",
        default=(base_dir / ".cursor" / "rules").as_posix(),
        help="Output directory for MDC files",
    )
    options = cli.parse_args()

    # Exit status: 0 when generation reports success, 1 when it reports
    # failure or raises.
    try:
        succeeded = generate_mdc_files(options.docs_dir, options.rules_dir)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
    else:
        sys.exit(0 if succeeded else 1)