update rag tutorial

zachary62 2025-03-30 19:57:57 -04:00
parent f6818621f2
commit 337dcc0d66
4 changed files with 44 additions and 9 deletions

README.md

@@ -4,6 +4,7 @@ This project demonstrates a simplified RAG system that retrieves relevant docume
 ## Features
+- Document chunking for better retrieval granularity
 - Simple vector-based document retrieval
 - Two-stage pipeline (offline indexing, online querying)
 - FAISS-powered similarity search
@@ -57,6 +58,7 @@ The magic happens through a two-stage pipeline implemented with PocketFlow:
 ```mermaid
 graph TD
     subgraph OfflineFlow[Offline Document Indexing]
+        ChunkDocs[ChunkDocumentsNode] --> EmbedDocs[EmbedDocumentsNode]
         EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
     end
@@ -66,14 +68,19 @@ graph TD
 ```

 Here's what each part does:
-1. **EmbedDocumentsNode**: Converts documents into vector representations
-2. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
-3. **EmbedQueryNode**: Converts user query into the same vector space
-4. **RetrieveDocumentNode**: Finds the most similar document using vector search
+1. **ChunkDocumentsNode**: Splits documents into smaller chunks for more granular retrieval
+2. **EmbedDocumentsNode**: Converts document chunks into vector representations
+3. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
+4. **EmbedQueryNode**: Converts user query into the same vector space
+5. **RetrieveDocumentNode**: Finds the most similar document chunk using vector search

 ## Example Output
 ```
+==================================================
+PocketFlow RAG Document Retrieval
+==================================================
+✅ Created 5 chunks from 5 documents
 ✅ Created 5 document embeddings
 🔍 Creating search index...
 ✅ Index created with 5 vectors
@@ -88,4 +95,4 @@ Here's what each part does:
 - [`main.py`](./main.py): Main entry point for running the RAG demonstration
 - [`flow.py`](./flow.py): Configures the flows that connect the nodes
 - [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
-- [`utils.py`](./utils.py): Utility functions including the embedding function
+- [`utils.py`](./utils.py): Utility functions including chunking and embedding functions
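
Tying the README changes together: a minimal sketch of how `main.py` might drive the two flows, assuming PocketFlow's `flow.run(shared)` entry point. Only the `texts` key is confirmed by this diff; the `query` key and the sample documents are illustrative.

```python
# Hypothetical driver for the two-stage pipeline; only shared["texts"]
# is confirmed by this diff, the "query" key is an assumption.
from flow import get_offline_flow, get_online_flow

shared = {
    "texts": [
        "PocketFlow is a minimalist LLM framework.",
        "FAISS performs efficient vector similarity search.",
    ],
}

# Offline: chunk -> embed -> index (ChunkDocumentsNode now runs first)
get_offline_flow().run(shared)

# Online: embed the query, then retrieve the most similar chunk
shared["query"] = "What is PocketFlow?"
get_online_flow().run(shared)
```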

flow.py

@@ -1,12 +1,13 @@
 from pocketflow import Flow
-from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode
+from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode, ChunkDocumentsNode

 def get_offline_flow():
     # Create offline flow for document indexing
+    chunk_docs_node = ChunkDocumentsNode()
     embed_docs_node = EmbedDocumentsNode()
     create_index_node = CreateIndexNode()
-    embed_docs_node >> create_index_node
-    offline_flow = Flow(start=embed_docs_node)
+    chunk_docs_node >> embed_docs_node >> create_index_node
+    offline_flow = Flow(start=chunk_docs_node)
     return offline_flow

 def get_online_flow():
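
The rewired `chunk_docs_node >> embed_docs_node >> create_index_node` line works because PocketFlow overloads `>>` to chain nodes. A simplified stand-in for that pattern (not PocketFlow's actual source) shows why the chaining composes left to right:

```python
# Sketch of the operator-chaining pattern behind `a >> b >> c`;
# a simplified stand-in, not PocketFlow's real implementation.
class NodeSketch:
    def __init__(self, name):
        self.name = name
        self.successors = {}

    def __rshift__(self, other):
        # Register `other` as the default successor, then return it
        # so the expression can keep chaining left to right.
        self.successors["default"] = other
        return other

a, b, c = NodeSketch("chunk"), NodeSketch("embed"), NodeSketch("index")
a >> b >> c
print(a.successors["default"].name)  # embed
print(b.successors["default"].name)  # index
```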

nodes.py

@@ -1,9 +1,31 @@
 from pocketflow import Node, Flow, BatchNode
 import numpy as np
 import faiss
-from utils import get_embedding, get_openai_embedding
+from utils import get_embedding, get_openai_embedding, fixed_size_chunk

 # Nodes for the offline flow
+class ChunkDocumentsNode(BatchNode):
+    def prep(self, shared):
+        """Read texts from shared store"""
+        return shared["texts"]
+
+    def exec(self, text):
+        """Chunk a single text into smaller pieces"""
+        return fixed_size_chunk(text)
+
+    def post(self, shared, prep_res, exec_res_list):
+        """Store chunked texts in the shared store"""
+        # Flatten the list of lists into a single list of chunks
+        all_chunks = []
+        for chunks in exec_res_list:
+            all_chunks.extend(chunks)
+
+        # Replace the original texts with the flat list of chunks
+        shared["texts"] = all_chunks
+
+        print(f"✅ Created {len(all_chunks)} chunks from {len(prep_res)} documents")
+        return "default"
+
 class EmbedDocumentsNode(BatchNode):
     def prep(self, shared):
         """Read texts from shared store and return as an iterable"""

utils.py

@@ -41,6 +41,11 @@ def get_openai_embedding(text):
     # Convert to numpy array for consistency with other embedding functions
     return np.array(embedding, dtype=np.float32)

+def fixed_size_chunk(text, chunk_size=2000):
+    chunks = []
+    for i in range(0, len(text), chunk_size):
+        chunks.append(text[i : i + chunk_size])
+    return chunks

 if __name__ == "__main__":
     # Test the embedding function
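
`fixed_size_chunk` slices purely by character count, so the last chunk may be shorter than `chunk_size`, and any text under 2000 characters comes back as a single chunk. A quick sanity check of that behavior:

```python
# Quick check of fixed_size_chunk's slicing behavior.
text = "x" * 4500
chunks = fixed_size_chunk(text, chunk_size=2000)
print([len(c) for c in chunks])             # [2000, 2000, 500]
print(len(fixed_size_chunk("short text")))  # 1: under chunk_size, one chunk
```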