#!/usr/bin/env python3
"""
Script to generate MDC files from the PocketFlow docs folder, creating one MDC file per MD file.

Usage:
    python update_pocketflow_mdc.py [--docs-dir PATH] [--rules-dir PATH]
"""

import html.parser
import os
import re
import shutil
import sys
from pathlib import Path


class HTMLTagStripper(html.parser.HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Typical use: ``feed()`` the markup, ``close()`` the parser so any text
    still buffered is flushed, then read the result with ``get_text()``.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is
        # needed here. convert_charrefs=True makes the parser decode character
        # references (e.g. "&amp;") before the text reaches handle_data().
        super().__init__(convert_charrefs=True)
        self.text = []  # text fragments, in document order

    def handle_data(self, data):
        """Collect one run of text found outside of tags."""
        self.text.append(data)

    def get_text(self):
        """Return every collected text fragment joined into a single string."""
        return ''.join(self.text)


def strip_html_tags(html_content):
    """Remove HTML tags from content and return the remaining text.

    Args:
        html_content: String that may contain HTML markup.

    Returns:
        The text of ``html_content`` with all tags removed, as produced by
        ``HTMLTagStripper``.
    """
    stripper = HTMLTagStripper()
    stripper.feed(html_content)
    # feed() can hold back trailing text (for example text ending in "&T")
    # while it waits for more input; close() forces that buffered text
    # through handle_data() so it is not silently dropped.
    stripper.close()
    return stripper.get_text()


def _frontmatter_value(frontmatter_text, key):
    """Return the unquoted value of the line starting with ``key:``, or None."""
    # Anchor at the start of a line so that e.g. "parent" does not match
    # inside "grand_parent", and keep the value on the same line as the key.
    match = re.search(
        rf'^[ \t]*{re.escape(key)}:[ \t]*(.+)$', frontmatter_text, re.MULTILINE
    )
    if not match:
        return None

    value = match.group(1).strip()
    if value[:1] in ('"', "'"):
        # Quoted value: keep only what sits between the matching quotes.
        closing = value.find(value[0], 1)
        if closing != -1:
            value = value[1:closing]
    value = value.strip()
    return value or None


def extract_frontmatter(file_path):
    """Extract title, parent, and nav_order from markdown frontmatter.

    Args:
        file_path: Path of the markdown file to read (UTF-8).

    Returns:
        A dict containing whichever of the keys ``'title'`` (str),
        ``'parent'`` (str) and ``'nav_order'`` (int) were found in the
        leading ``---`` delimited block. The dict is empty when the file has
        no frontmatter or cannot be read; read errors are printed, not raised.
    """
    frontmatter = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading frontmatter from {file_path}: {e}")
        return frontmatter

    # Frontmatter is the text between the leading --- markers
    fm_match = re.search(r'^---\s*(.+?)\s*---', content, re.DOTALL)
    if not fm_match:
        return frontmatter
    frontmatter_text = fm_match.group(1)

    for key in ('title', 'parent'):
        value = _frontmatter_value(frontmatter_text, key)
        if value is not None:
            frontmatter[key] = value

    nav_order = _frontmatter_value(frontmatter_text, 'nav_order')
    if nav_order is not None:
        # Only the leading run of digits is used.
        digits = re.match(r'\d+', nav_order)
        if digits:
            frontmatter['nav_order'] = int(digits.group(0))

    return frontmatter


def extract_first_heading(file_path):
    """Extract the first heading from markdown content.

    Args:
        file_path: Path of the markdown file to read (UTF-8).

    Returns:
        The text of the first ``#``-style heading (any level from 1 to 6)
        found after the frontmatter and outside fenced code blocks. When the
        file has no such heading or cannot be read, the file name stem with
        underscores replaced by spaces and title-cased. Read errors are
        printed, not raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error extracting heading from {file_path}: {e}")
    else:
        # Remove frontmatter
        content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)

        # Drop fenced code blocks so "# comment" lines inside code samples
        # are not mistaken for headings.
        content = re.sub(
            r'^[ \t]*(`{3,}|~{3,}).*?^[ \t]*\1[ \t\r]*$',
            '',
            content,
            flags=re.DOTALL | re.MULTILINE,
        )

        # A heading must start its line; a '#' in the middle of a sentence
        # (e.g. "C# is ...") is not one.
        heading_match = re.search(
            r'^[ \t]{0,3}#{1,6}[ \t]+(.+)$', content, re.MULTILINE
        )
        if heading_match:
            heading = heading_match.group(1).strip()
            if heading:
                return heading

    # Fallback to filename if no heading found
    return Path(file_path).stem.replace('_', ' ').title()


def get_mdc_description(md_file, frontmatter, heading):
    """Generate a description for the MDC file based on file metadata.

    Args:
        md_file: Path of the markdown file; its folder names select the
            section and its file name identifies the root index.
        frontmatter: Dict of parsed frontmatter; only ``'title'`` is used.
        heading: Fallback subsection text used when there is no title.

    Returns:
        A one-line description that starts with
        ``"Guidelines for using PocketFlow"``, followed by the section (when
        the path lies in a known section folder) and the subsection.
    """
    doc_path = Path(md_file)

    # Known section folders, checked in this order of precedence.
    section_labels = (
        ('core_abstraction', "Core Abstraction"),
        ('design_pattern', "Design Pattern"),
        ('utility_function', "Utility Function"),
    )
    section = next(
        (label for folder, label in section_labels if folder in doc_path.parts),
        "",
    )

    # For index.md outside any section folder, use a different format
    if doc_path.name == "index.md" and not section:
        return "Guidelines for using PocketFlow, a minimalist LLM framework"

    # Prefer the frontmatter title; an absent or empty title falls back to
    # the heading.
    subsection = (frontmatter or {}).get('title') or heading

    # Join only the parts that are present so a missing section or
    # subsection never leaves a dangling ", " in the description.
    parts = ("Guidelines for using PocketFlow", section, subsection)
    return ", ".join(str(part) for part in parts if part)


def process_markdown_content(content, remove_local_refs=False):
    """Process markdown content to make it suitable for an MDC file.

    Args:
        content: Raw markdown text, optionally starting with frontmatter.
        remove_local_refs: When True, links to local docs (``./...``) are
            reduced to their bracketed text; when False they are rewritten
            to ``mdc:./...`` references.

    Returns:
        The markdown with frontmatter and ``<div>`` blocks removed, local
        links adjusted, and remaining HTML tags stripped.
    """
    # Remove frontmatter
    content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)

    # Remove HTML div tags together with their content
    content = re.sub(r'<div.*?>.*?</div>', '', content, flags=re.DOTALL)

    if remove_local_refs:
        # Replace markdown links to local documentation with just the text in
        # brackets. This prevents automatically including all docs when the
        # file is loaded; the brackets are kept for better discoverability.
        content = re.sub(r'\[([^\]]+)\]\(\./[^)]+\)', r'[\1]', content)
    else:
        # Adjust relative links to maintain references within the docs structure
        content = re.sub(r'\]\(\./([^)]+)\)', r'](mdc:./\1)', content)

        # Links written against .html pages are pointed at the matching .md
        # file instead; a trailing "#anchor" is carried over unchanged.
        content = re.sub(
            r'\]\(mdc:\./([^)#]+?)\.html(#[^)]*)?\)',
            r'](mdc:./\1.md\2)',
            content,
        )

    # Strip remaining HTML tags
    return strip_html_tags(content)


def generate_mdc_header(md_file, description, always_apply=False):
    """Generate the MDC file header (frontmatter block).

    Args:
        md_file: Path of the source markdown file. Not used when building
            the header; kept so existing callers keep working.
        description: Text for the ``description`` field.
        always_apply: When True the header sets ``globs`` to ``**/*.py`` and
            ``alwaysApply`` to ``true``; otherwise ``globs`` is left empty
            and ``alwaysApply`` is ``false``.

    Returns:
        The header text, starting and ending with a ``---`` line.
    """
    # Files that always apply get a Python glob to provide high-level context
    # for Python files; all others leave it empty to be less intrusive.
    globs = "**/*.py" if always_apply else ""
    always_apply_flag = "true" if always_apply else "false"

    # Each field occupies exactly one line, so any line breaks or runs of
    # whitespace inside the description are collapsed to single spaces.
    one_line_description = " ".join(str(description).split())

    return (
        "---\n"
        f"description: {one_line_description}\n"
        f"globs: {globs}\n"
        f"alwaysApply: {always_apply_flag}\n"
        "---\n"
    )


def has_substantive_content(content, min_chars=20):
    """Check whether content has substantive text beyond the frontmatter.

    Args:
        content: Markdown text, optionally starting with frontmatter.
        min_chars: Number of characters that must be exceeded, after
            cleaning, for the content to count as substantive. The default
            of 20 is an arbitrary threshold; adjust as needed.

    Returns:
        True when more than ``min_chars`` characters remain once the
        frontmatter, all whitespace and ``{: ...}`` attribute markers have
        been removed; False otherwise.
    """
    # Remove frontmatter
    body = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)

    # Remove all whitespace first, then the {: ...} attribute markers (which
    # at that point no longer contain spaces).
    compact = re.sub(r'\s+', '', body)
    compact = re.sub(r'{:.*?}', '', compact)

    # If there's almost nothing left after cleaning, consider it empty
    return len(compact) > min_chars


def convert_md_to_mdc(md_file, output_dir, docs_dir, special_treatment=False):
    """Convert a markdown file to MDC format and save it to the output directory.

    Args:
        md_file: Path of the markdown file to convert.
        output_dir: Directory under which the ``.mdc`` file is written; the
            file keeps its path relative to ``docs_dir``.
        docs_dir: Root of the docs tree, used to compute the relative path.
        special_treatment: When True (and always for files named
            ``guide.md``), local links are reduced to plain text and the
            header is generated with ``always_apply=True``.

    Returns:
        True when the file was converted or deliberately skipped for lack of
        substantive content; False when any error occurred (the error is
        printed, not raised).
    """
    try:
        print(f"Processing: {md_file}")
        source = Path(md_file)

        # Read the content once; it feeds both the skip checks and the
        # conversion itself.
        with open(source, 'r', encoding='utf-8') as f:
            content = f.read()

        # Skip empty index.md files that sit directly in a section subfolder
        # (the main index.md is not inside one of these folders).
        section_folders = ("core_abstraction", "design_pattern", "utility_function")
        if source.name == "index.md" and source.parent.name in section_folders:
            if not has_substantive_content(content):
                print(f"Skipping empty subfolder index: {md_file}")
                return True

        # Check if this file should have special treatment (index.md or guide.md)
        is_special = special_treatment or source.name == "guide.md"

        # Process the content and make sure something substantive is left
        # before doing any further work for this file.
        processed_content = process_markdown_content(content, remove_local_refs=is_special)
        if not has_substantive_content(processed_content):
            print(f"Skipping file with no substantive content after processing: {md_file}")
            return True

        # Extract metadata from file and build the MDC header
        frontmatter = extract_frontmatter(md_file)
        heading = extract_first_heading(md_file)
        description = get_mdc_description(md_file, frontmatter, heading)
        mdc_header = generate_mdc_header(md_file, description, always_apply=is_special)

        # Get the path relative to the docs directory
        rel_path = os.path.relpath(md_file, start=Path(docs_dir))

        # Drop a leading 'docs' component, if the relative path has one
        path_parts = Path(rel_path).parts
        if len(path_parts) > 1 and path_parts[0] == 'docs':
            rel_path = os.path.join(*path_parts[1:])

        # Build the output path with the extension changed from .md to .mdc
        output_path = (Path(output_dir) / rel_path).with_suffix('.mdc')

        # Create output directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write the MDC file: header followed by the processed content
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(mdc_header + processed_content)

        print(f"Created MDC file: {output_path}")
        return True

    except Exception as e:
        # Per-file boundary: report the problem and let the caller count the
        # failure instead of aborting the whole run.
        print(f"Error converting {md_file} to MDC: {e}")
        return False


def generate_mdc_files(docs_dir, rules_dir):
    """Generate MDC files from all markdown files in the docs directory.

    The top-level ``index.md`` and ``guide.md`` (when present) are converted
    first with special treatment; every other ``*.md`` file found
    recursively under ``docs_dir`` is converted afterwards.

    Args:
        docs_dir: Directory containing the markdown docs.
        rules_dir: Directory the MDC files are written to; created if
            missing.

    Returns:
        True when at least one file was processed successfully and none
        failed; False otherwise.

    Raises:
        ValueError: If ``docs_dir`` is not an existing directory.
    """
    docs_path = Path(docs_dir)
    rules_path = Path(rules_dir)

    # Make sure the docs directory exists
    if not docs_path.is_dir():
        raise ValueError(f"Directory not found: {docs_dir}")

    print(f"Generating MDC files from docs in: {docs_dir}")
    print(f"Output will be written to: {rules_dir}")

    # Create the rules directory if it doesn't exist
    rules_path.mkdir(parents=True, exist_ok=True)

    # The main index.md and guide.md get special treatment and go first
    special_files = [
        candidate
        for candidate in (docs_path / "index.md", docs_path / "guide.md")
        if candidate.exists()
    ]

    # All other markdown files, in a stable order
    regular_files = sorted(
        md for md in docs_path.glob("**/*.md") if md not in special_files
    )

    jobs = [(md, True) for md in special_files]
    jobs.extend((md, False) for md in regular_files)

    # Count every conversion, including the special files, so the summary
    # and the return value reflect what actually happened.
    success_count = 0
    failure_count = 0
    for md_file, is_special in jobs:
        if convert_md_to_mdc(md_file, rules_path, docs_dir, special_treatment=is_special):
            success_count += 1
        else:
            failure_count += 1

    print(f"\nProcessed {len(jobs)} markdown files:")
    print(f" - Successfully converted: {success_count}")
    print(f" - Failed conversions: {failure_count}")

    return success_count > 0 and failure_count == 0


if __name__ == "__main__":
    import argparse

    # Defaults are resolved relative to this script's location: the docs
    # live in ../docs and the rules are written to ../.cursor/rules.
    script_dir = Path(__file__).parent.absolute()
    default_docs_dir = (script_dir.parent / "docs").as_posix()
    default_rules_dir = (script_dir.parent / ".cursor" / "rules").as_posix()

    parser = argparse.ArgumentParser(description="Generate MDC files from PocketFlow docs")
    parser.add_argument(
        "--docs-dir",
        default=default_docs_dir,
        help="Path to PocketFlow docs directory",
    )
    parser.add_argument(
        "--rules-dir",
        default=default_rules_dir,
        help="Output directory for MDC files",
    )
    args = parser.parse_args()

    # Exit status: 0 when generation succeeded, 1 on failure or error.
    try:
        success = generate_mdc_files(args.docs_dir, args.rules_dir)
    except Exception as e:
        # Top-level boundary: report on stderr so the message is not mixed
        # with the regular progress output.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    sys.exit(0 if success else 1)