update cursor rule files

This commit is contained in:
BO WEN 2025-04-30 11:46:43 -04:00
parent b561a10c76
commit f6c4b06db8
12 changed files with 246 additions and 110 deletions

View File

@ -50,30 +50,75 @@ flow.run(shared)
A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set.
### Key Differences from BatchNode
**Important**: Unlike BatchNode, which processes items and modifies the shared store:
1. BatchFlow returns **parameters to pass to the child Flow**, not data to process
2. These parameters are accessed in child nodes via `self.params`, not from the shared store
3. Each child Flow runs independently with a different set of parameters
4. Child nodes can be regular Nodes, not BatchNodes (the batching happens at the Flow level)
### Example: Summarize Many Files
```python
class SummarizeAllFiles(BatchFlow):
def prep(self, shared):
# Return a list of param dicts (one per file)
# IMPORTANT: Return a list of param dictionaries (not data for processing)
filenames = list(shared["data"].keys()) # e.g., ["file1.txt", "file2.txt", ...]
return [{"filename": fn} for fn in filenames]
# Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce):
summarize_file = SummarizeFile(start=load_file)
# Child node that accesses filename from params, not shared store
class LoadFile(Node):
def prep(self, shared):
# Access filename from params (not from shared)
filename = self.params["filename"] # Important! Use self.params, not shared
return filename
# Wrap that flow into a BatchFlow:
def exec(self, filename):
with open(filename, 'r') as f:
return f.read()
def post(self, shared, prep_res, exec_res):
# Store file content in shared
shared["current_file_content"] = exec_res
return "default"
# Summarize node that works on the currently loaded file
class Summarize(Node):
def prep(self, shared):
return shared["current_file_content"]
def exec(self, content):
prompt = f"Summarize this file in 50 words: {content}"
return call_llm(prompt)
def post(self, shared, prep_res, exec_res):
# Store summary in shared, indexed by current filename
filename = self.params["filename"] # Again, using params
if "summaries" not in shared:
shared["summaries"] = {}
shared["summaries"][filename] = exec_res
return "default"
# Create a per-file flow
load_file = LoadFile()
summarize = Summarize()
load_file >> summarize
summarize_file = Flow(start=load_file)
# Wrap in a BatchFlow to process all files
summarize_all_files = SummarizeAllFiles(start=summarize_file)
summarize_all_files.run(shared)
```
### Under the Hood
1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`.
1. `prep(shared)` in the BatchFlow returns a list of param dicts—e.g., `[{"filename": "file1.txt"}, {"filename": "file2.txt"}, ...]`.
2. The **BatchFlow** loops through each dict. For each one:
- It merges the dict with the BatchFlow's own `params`.
- It calls `flow.run(shared)` using the merged result.
3. This means the sub-Flow is run **repeatedly**, once for every param dict.
- It merges the dict with the BatchFlow's own `params` (if any): `{**batch_flow.params, **dict_from_prep}`
- It calls `flow.run(shared)` using the merged parameters
- **IMPORTANT**: These parameters are passed to the child Flow's nodes via `self.params`, NOT via the shared store
3. This means the sub-Flow is run **repeatedly**, once for every param dict, with each node in the flow accessing the parameters via `self.params`.
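For intuition, the loop can be modeled roughly like this (a simplified sketch of the behavior described above, not the actual PocketFlow source; the `set_params` call is an assumption about how params reach child nodes):
```python
class BatchFlowSketch:
    """Simplified model of how a BatchFlow replays its child Flow (not the real source)."""
    def __init__(self, start, params=None):
        self.flow = start             # the child Flow to replay
        self.params = params or {}    # params inherited from a parent flow, if any

    def prep(self, shared):
        return []                     # subclasses return a list of param dicts

    def run(self, shared):
        for param_dict in self.prep(shared):
            merged = {**self.params, **param_dict}  # merge parent params with this dict
            self.flow.set_params(merged)            # child nodes read these via self.params
            self.flow.run(shared)                   # replay the entire child Flow once per dict
```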
---
@ -89,6 +134,7 @@ At each level, **BatchFlow** merges its own param dict with the parent's. By t
class FileBatchFlow(BatchFlow):
def prep(self, shared):
# Access directory from params (set by parent)
directory = self.params["directory"]
# e.g., files = ["file1.txt", "file2.txt", ...]
files = [f for f in os.listdir(directory) if f.endswith(".txt")]
@ -99,7 +145,31 @@ class DirectoryBatchFlow(BatchFlow):
directories = [ "/path/to/dirA", "/path/to/dirB"]
return [{"directory": d} for d in directories]
# MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"}
inner_flow = FileBatchFlow(start=MapSummaries())
# The actual processing node
class ProcessFile(Node):
def prep(self, shared):
# Access both directory and filename from params
directory = self.params["directory"] # From outer batch
filename = self.params["filename"] # From inner batch
full_path = os.path.join(directory, filename)
return full_path
def exec(self, full_path):
# Process the file...
return f"Processed {full_path}"
def post(self, shared, prep_res, exec_res):
# Store results, perhaps indexed by path
if "results" not in shared:
shared["results"] = {}
shared["results"][prep_res] = exec_res
return "default"
# Set up the nested batch structure
process_node = ProcessFile()
inner_flow = FileBatchFlow(start=process_node)
outer_flow = DirectoryBatchFlow(start=inner_flow)
# Run it
outer_flow.run(shared)
```

View File

@ -13,10 +13,10 @@ Nodes and Flows **communicate** in 2 ways:
- Great for data results, large content, or anything multiple nodes need.
- You shall design the data structure and populate it ahead of time.
- > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*! This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](mdc:batch.md).
- > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*! This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](mdc:./batch.md).
{: .best-practice }
2. **Params (only for [Batch](mdc:batch.md))**
2. **Params (only for [Batch](mdc:./batch.md))**
- Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**.
- Good for identifiers like filenames or numeric IDs, in Batch mode.
@ -84,7 +84,7 @@ Here:
> Only set the uppermost Flow params because others will be overwritten by the parent Flow.
>
> If you need to set child node params, see [Batch](mdc:batch.md).
> If you need to set child node params, see [Batch](mdc:./batch.md).
{: .warning }
Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store.
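As a minimal sketch of that pattern (assuming the `pocketflow` package exports `Node`/`Flow` and that Flows expose a `set_params` helper, as described in these docs; the task names are illustrative):
```python
from pocketflow import Node, Flow

class LoadTask(Node):
    def prep(self, shared):
        task_id = self.params["task_id"]      # identifier passed in by the parent Flow
        return shared["tasks"][task_id]       # fetch the assigned task from the shared store

    def exec(self, task):
        return f"done: {task}"

    def post(self, shared, prep_res, exec_res):
        shared["results"][self.params["task_id"]] = exec_res  # write back under the same identifier
        return "default"

flow = Flow(start=LoadTask())
flow.set_params({"task_id": "task-1"})        # set params on the uppermost Flow only
shared = {"tasks": {"task-1": "summarize the report"}, "results": {}}
flow.run(shared)
```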

View File

@ -12,7 +12,7 @@ Agent is a powerful design pattern in which nodes can take dynamic actions based
## Implement Agent with Graph
1. **Context and Action:** Implement nodes that supply context and perform actions.
2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](mdc:../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step.
2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step.
3. **Agent Node:** Provide a prompt to decide action—for example:
```python
@ -48,7 +48,7 @@ parameters:
The core of building **high-performance** and **reliable** agents boils down to:
1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](mdc:rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](mdc:https:/arxiv.org/abs/2307.03172), overlooking mid-prompt content.
1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](mdc:./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content.
2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or `read_csvs`. Instead, import CSVs into the database.

View File

@ -13,7 +13,7 @@ and there is a logical way to break the task into smaller, ideally independent p
You first break down the task using [BatchNode](mdc:../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase.
You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase.
### Example: Document Summarization
@ -65,5 +65,5 @@ print("Individual Summaries:", shared["file_summaries"])
print("\nFinal Summary:\n", shared["all_files_summary"])
```
> **Performance Tip**: The example above works sequentially. You can speed up the map phase by running it in parallel. See [(Advanced) Parallel](mdc:../core_abstraction/parallel.md) for more details.
> **Performance Tip**: The example above works sequentially. You can speed up the map phase by running it in parallel. See [(Advanced) Parallel](../core_abstraction/parallel.md) for more details.
{: .note }

View File

@ -5,7 +5,7 @@ alwaysApply: false
---
# (Advanced) Multi-Agents
Multiple [Agents](mdc:flow.md) can work together by handling subtasks and communicating the progress.
Multiple [Agents](mdc:./flow.md) can work together by handling subtasks and communicating the progress.
Communication between agents is typically implemented using message queues in shared storage.
> Most of the time, you don't need Multi-Agents. Start with a simple solution first.
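A hedged sketch of that queue pattern (it assumes PocketFlow's async API—`AsyncNode`, `AsyncFlow`, `run_async`—as described in the async docs; the names are illustrative):
```python
import asyncio
from pocketflow import AsyncNode, AsyncFlow

class AgentListener(AsyncNode):
    async def prep_async(self, shared):
        return await shared["msg_queue"].get()      # wait for a message from another agent

    async def exec_async(self, message):
        return f"handled: {message}"

    async def post_async(self, shared, prep_res, exec_res):
        shared.setdefault("log", []).append(exec_res)
        return "default"                             # a long-lived agent would loop back instead

async def main():
    shared = {"msg_queue": asyncio.Queue()}
    await shared["msg_queue"].put("hello from agent A")  # another agent would enqueue messages here
    await AsyncFlow(start=AgentListener()).run_async(shared)
    print(shared["log"])

asyncio.run(main())
```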

View File

@ -16,9 +16,9 @@ For certain LLM tasks like answering questions, providing relevant context is es
## Stage 1: Offline Indexing
We create three Nodes:
1. `ChunkDocs` [chunks](mdc:../utility_function/chunking.md) raw text.
2. `EmbedDocs` [embeds](mdc:../utility_function/embedding.md) each chunk.
3. `StoreIndex` stores embeddings into a [vector database](mdc:../utility_function/vector.md).
1. `ChunkDocs` [chunks](../utility_function/chunking.md) raw text.
2. `EmbedDocs` [embeds](../utility_function/embedding.md) each chunk.
3. `StoreIndex` stores embeddings into a [vector database](../utility_function/vector.md).
```python
class ChunkDocs(BatchNode):

View File

@ -81,7 +81,7 @@ summary:
return structured_result
```
> Besides using `assert` statements, another popular way to validate schemas is [Pydantic](mdc:https:/github.com/pydantic/pydantic)
> Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic)
{: .note }
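For instance, a minimal Pydantic check might look like this (a sketch only; the field names mirror the `summary` example above, and PyYAML/Pydantic are assumed to be installed):
```python
import yaml
from pydantic import BaseModel

class SummaryResult(BaseModel):
    summary: list[str]            # expect a list of bullet-point strings

raw = """
summary:
  - point one
  - point two
"""
result = SummaryResult(**yaml.safe_load(raw))   # raises ValidationError if the shape is wrong
print(result.summary)
```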
### Why YAML instead of JSON?

View File

@ -5,14 +5,14 @@ alwaysApply: false
---
# Workflow
Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](mdc:../core_abstraction/flow.md) of multiple Nodes.
Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes.
> - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*.
> - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*.
>
> You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](mdc:agent.md).
> You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](mdc:./agent.md).
{: .best-practice }
### Example: Article Writing
@ -46,4 +46,4 @@ shared = {"topic": "AI Safety"}
writing_flow.run(shared)
```
For *dynamic cases*, consider using [Agents](mdc:agent.md).
For *dynamic cases*, consider using [Agents](mdc:./agent.md).

View File

@ -3,9 +3,19 @@ description: Guidelines for using PocketFlow, Agentic Coding
globs: **/*.py
alwaysApply: true
---
# DOCUMENTATION FIRST POLICY
**CRITICAL INSTRUCTION**: When implementing a Pocket Flow app:
1. **ALWAYS REQUEST MDC FILES FIRST** - Before writing any code, request and review all relevant MDC documentation files. This doc provides an explanation of those documents.
2. **UNDERSTAND THE FRAMEWORK** - Gain a comprehensive understanding of the Pocket Flow framework from the documentation.
3. **AVOID ASSUMPTION-DRIVEN DEVELOPMENT** - Do not base your implementation on assumptions or guesswork. Even if the human didn't explicitly mention Pocket Flow in their request, if the code you are editing uses Pocket Flow, request the relevant docs as well to understand best practices before editing.
**VERIFICATION**: Begin each implementation with a brief summary of the documentation you've reviewed to inform your approach.
# Agentic Coding: Humans Design, Agents code!
> If you are an AI agents involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification.
> If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification.
{: .warning }
## Agentic Coding Steps
@ -225,3 +235,62 @@ my_project/
if __name__ == "__main__":
main()
```
# Pocket Flow
A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*.
- **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, no vendor lock-in.
- **Expressive**: Everything you love from larger frameworks—([Multi-])[Agents], [Workflow], [RAG], and more.
- **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications.
## Core Abstraction
We model the LLM workflow as a **Graph + Shared Store**:
- [Node] handles simple (LLM) tasks.
- [Flow] connects nodes through **Actions** (labeled edges).
- [Shared Store] enables communication between nodes within flows.
- [Batch] nodes/flows allow for data-intensive tasks.
- [Async] nodes/flows allow waiting for asynchronous tasks.
- [(Advanced) Parallel] nodes/flows handle I/O-bound tasks.
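In code, the whole model fits in a few lines (a minimal sketch assuming the `pocketflow` package exports `Node` and `Flow` as documented above):
```python
from pocketflow import Node, Flow

class Greet(Node):
    def prep(self, shared):
        return shared["name"]            # read input from the shared store

    def exec(self, name):
        return f"Hello, {name}!"         # the (LLM) task itself would go here

    def post(self, shared, prep_res, exec_res):
        shared["greeting"] = exec_res    # write the result back for later nodes
        return "default"                 # the action label selects the next edge

shared = {"name": "PocketFlow"}
Flow(start=Greet()).run(shared)
print(shared["greeting"])
```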
## Design Pattern
From there, it's easy to implement popular design patterns:
- [Agent] autonomously makes decisions.
- [Workflow] chains multiple tasks into pipelines.
- [RAG] integrates data retrieval with generation.
- [Map Reduce] splits data tasks into Map and Reduce steps.
- [Structured Output] formats outputs consistently.
- [(Advanced) Multi-Agents] coordinate multiple agents.
## Utility Function
We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*:
- [LLM Wrapper]
- [Viz and Debug]
- [Web Search]
- [Chunking]
- [Embedding]
- [Vector Databases]
- [Text-to-Speech]
**Why not built-in?**: I believe it's a *bad practice* to hardcode vendor-specific APIs into a general framework:
- *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs.
- *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally.
- *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in.
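As an example of rolling your own, a thin wrapper might look like this (a sketch only—the `openai` v1 client, model name, and environment variable are placeholder choices, not part of the framework; swap in any vendor or local model):
```python
import os
from openai import OpenAI

def call_llm(prompt: str) -> str:
    """Minimal DIY LLM wrapper; not provided by PocketFlow itself."""
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
```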
## Ready to build your Apps?
Check out [Agentic Coding Guidance], the fastest way to develop LLM projects with Pocket Flow!

View File

@ -1,62 +0,0 @@
---
description: Guidelines for using PocketFlow, a minimalist LLM framework
globs: **/*.py
alwaysApply: true
---
# Pocket Flow
A [100-line](mdc:https:/github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*.
- **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, and vendor lock-in.
- **Expressive**: Everything you love from larger frameworks—([Multi-])[Agents], [Workflow], [RAG], and more.
- **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications.
## Core Abstraction
We model the LLM workflow as a **Graph + Shared Store**:
- [Node] handles simple (LLM) tasks.
- [Flow] connects nodes through **Actions** (labeled edges).
- [Shared Store] enables communication between nodes within flows.
- [Batch] nodes/flows allow for data-intensive tasks.
- [Async] nodes/flows allow waiting for asynchronous tasks.
- [(Advanced) Parallel] nodes/flows handle I/O-bound tasks.
## Design Pattern
From there, its easy to implement popular design patterns:
- [Agent] autonomously makes decisions.
- [Workflow] chains multiple tasks into pipelines.
- [RAG] integrates data retrieval with generation.
- [Map Reduce] splits data tasks into Map and Reduce steps.
- [Structured Output] formats outputs consistently.
- [(Advanced) Multi-Agents] coordinate multiple agents.
## Utility Function
We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*:
- [LLM Wrapper]
- [Viz and Debug]
- [Web Search]
- [Chunking]
- [Embedding]
- [Vector Databases]
- [Text-to-Speech]
**Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework:
- *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs.
- *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally.
- *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in.
## Ready to build your Apps?
Check out [Agentic Coding Guidance], the fastest way to develop LLM projects with Pocket Flow!

View File

@ -26,11 +26,11 @@ def build_mermaid(start):
return parent and link(parent, get_id(node))
visited.add(node)
if isinstance(node, Flow):
node.start and parent and link(parent, get_id(node.start))
node.start_node and parent and link(parent, get_id(node.start_node))
lines.append(f"\n subgraph sub_flow_{get_id(node)}[{type(node).__name__}]")
node.start and walk(node.start)
node.start_node and walk(node.start_node)
for nxt in node.successors.values():
node.start and walk(nxt, get_id(node.start)) or (parent and link(parent, get_id(nxt))) or walk(nxt)
node.start_node and walk(nxt, get_id(node.start_node)) or (parent and link(parent, get_id(nxt))) or walk(nxt)
lines.append(" end\n")
else:
lines.append(f" {(nid := get_id(node))}['{type(node).__name__}']")

View File

@ -101,6 +101,10 @@ def get_mdc_description(md_file, frontmatter, heading):
else:
subsection = heading
# For the combined guide and index
if Path(md_file).name == "guide.md":
return "Guidelines for using PocketFlow, Agentic Coding"
# For index.md at root level, use a different format
if Path(md_file).name == "index.md" and section == "":
return "Guidelines for using PocketFlow, a minimalist LLM framework"
@ -137,6 +141,20 @@ def process_markdown_content(content, remove_local_refs=False):
return content
def get_documentation_first_policy():
"""Return the DOCUMENTATION FIRST POLICY text to be included in the guide"""
return """# DOCUMENTATION FIRST POLICY
**CRITICAL INSTRUCTION**: When implementing a Pocket Flow app:
1. **ALWAYS REQUEST MDC FILES FIRST** - Before writing any code, request and review all relevant MDC documentation files. This doc provides an explanation of those documents.
2. **UNDERSTAND THE FRAMEWORK** - Gain a comprehensive understanding of the Pocket Flow framework from the documentation.
3. **AVOID ASSUMPTION-DRIVEN DEVELOPMENT** - Do not base your implementation on assumptions or guesswork. Even if the human didn't explicitly mention Pocket Flow in their request, if the code you are editing uses Pocket Flow, request the relevant docs as well to understand best practices before editing.
**VERIFICATION**: Begin each implementation with a brief summary of the documentation you've reviewed to inform your approach.
"""
def generate_mdc_header(md_file, description, always_apply=False):
"""Generate MDC file header with appropriate frontmatter"""
# Determine if we should include globs
@ -163,13 +181,64 @@ def has_substantive_content(content):
# If there's almost nothing left after cleaning, consider it empty
return len(cleaned_content) > 20 # Arbitrary threshold, adjust as needed
def create_combined_guide(docs_dir, rules_dir):
"""Create a combined guide that includes both the guide and index content"""
docs_path = Path(docs_dir)
rules_path = Path(rules_dir)
guide_file = docs_path / "guide.md"
index_file = docs_path / "index.md"
if not guide_file.exists() or not index_file.exists():
print("Warning: guide.md or index.md not found, skipping combined guide creation")
return False
# Get guide content and index content
with open(guide_file, 'r', encoding='utf-8') as f:
guide_content = f.read()
with open(index_file, 'r', encoding='utf-8') as f:
index_content = f.read()
# Process the content
processed_guide = process_markdown_content(guide_content, remove_local_refs=True)
processed_index = process_markdown_content(index_content, remove_local_refs=True)
# Get the documentation first policy
doc_first_policy = get_documentation_first_policy()
# Combine the content with the documentation first policy at the beginning
combined_content = doc_first_policy + processed_guide + "\n\n" + processed_index
# Generate the MDC header
description = "Guidelines for using PocketFlow, Agentic Coding"
mdc_header = generate_mdc_header(guide_file, description, always_apply=True)
# Combine header and processed content
mdc_content = mdc_header + combined_content
# Create the output path with the new filename
output_path = rules_path / "guide_for_pocketflow.mdc"
# Write the MDC file
with open(output_path, 'w', encoding='utf-8') as f:
f.write(mdc_content)
print(f"Created combined guide MDC file: {output_path}")
return True
def convert_md_to_mdc(md_file, output_dir, docs_dir, special_treatment=False):
"""Convert a markdown file to MDC format and save to the output directory"""
try:
print(f"Processing: {md_file}")
# Skip empty index.md files in subfolders
# Skip guide.md and index.md as they'll be handled separately
file_name = Path(md_file).name
if file_name in ["guide.md", "index.md"]:
print(f"Skipping {file_name} for individual processing - it will be included in the combined guide")
return True
# Skip empty index.md files in subfolders
parent_dir = Path(md_file).parent.name
# Check if this is an index.md in a subfolder (not the main index.md)
@ -194,14 +263,11 @@ def convert_md_to_mdc(md_file, output_dir, docs_dir, special_treatment=False):
with open(md_file, 'r', encoding='utf-8') as f:
content = f.read()
# Check if this file should have special treatment (index.md or guide.md)
is_special = special_treatment or Path(md_file).name == "guide.md"
# Process the content
processed_content = process_markdown_content(content, remove_local_refs=is_special)
processed_content = process_markdown_content(content, remove_local_refs=special_treatment)
# Generate the MDC header
mdc_header = generate_mdc_header(md_file, description, always_apply=is_special)
mdc_header = generate_mdc_header(md_file, description, always_apply=special_treatment)
# Combine header and processed content
mdc_content = mdc_header + processed_content
@ -255,15 +321,8 @@ def generate_mdc_files(docs_dir, rules_dir):
# Create the rules directory if it doesn't exist
rules_path.mkdir(parents=True, exist_ok=True)
# Process the main index.md file first
index_file = docs_path / "index.md"
if index_file.exists():
convert_md_to_mdc(index_file, rules_path, docs_dir, special_treatment=True)
# Process guide.md file with special treatment (if it exists)
guide_file = docs_path / "guide.md"
if guide_file.exists():
convert_md_to_mdc(guide_file, rules_path, docs_dir, special_treatment=True)
# Create the combined guide file first (includes both guide.md and index.md)
create_combined_guide(docs_dir, rules_dir)
# Process all other markdown files
success_count = 0
@ -272,8 +331,8 @@ def generate_mdc_files(docs_dir, rules_dir):
# Find all markdown files
md_files = list(docs_path.glob("**/*.md"))
# Skip the main index.md and guide.md files as we've already processed them
md_files = [f for f in md_files if f != index_file and f != guide_file]
# Skip the main index.md and guide.md files as we've already processed them in create_combined_guide
md_files = [f for f in md_files if f.name != "index.md" and f.name != "guide.md"]
# Process each markdown file
for md_file in md_files:
@ -282,8 +341,8 @@ def generate_mdc_files(docs_dir, rules_dir):
else:
failure_count += 1
print(f"\nProcessed {len(md_files) + 2} markdown files:")
print(f" - Successfully converted: {success_count + 2}")
print(f"\nProcessed {len(md_files) + 1} markdown files:") # +1 for the combined guide
print(f" - Successfully converted: {success_count + 1}") # +1 for the combined guide
print(f" - Failed conversions: {failure_count}")
return success_count > 0 and failure_count == 0
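# For reference, a hypothetical entry point that drives this generator might look like the
# following (the directory names and exit-code handling are assumptions, not shown in the diff):
if __name__ == "__main__":
    docs_dir = "docs"              # source markdown documentation
    rules_dir = ".cursor/rules"    # output directory for the generated .mdc files
    success = generate_mdc_files(docs_dir, rules_dir)
    raise SystemExit(0 if success else 1)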