update rag tutorial

parent 05cd05efe2
commit 833e0870cd

@@ -1,4 +1,4 @@
 pocketflow>=0.0.1
-aiohttp>=3.8.0 # For async HTTP requests
-openai>=1.0.0 # For async LLM calls
+aiohttp>=3.8.0 # For HTTP requests
+openai>=1.0.0 # For LLM calls
 duckduckgo-search>=7.5.2 # For web search

@@ -0,0 +1,91 @@
# Retrieval Augmented Generation (RAG)

This project demonstrates a simplified RAG system that retrieves relevant documents based on user queries.

## Features

- Simple vector-based document retrieval
- Two-stage pipeline (offline indexing, online querying)
- FAISS-powered similarity search

## Getting Started

1. Install the required dependencies:

```bash
pip install -r requirements.txt
```

2. Run the application with a sample query:

```bash
python main.py --"Large Language Model"
```

3. Or run without arguments to use the default query:

```bash
python main.py
```

## API Key

By default, the demo uses a dummy embedding based on character frequencies. To use real OpenAI embeddings:

1. Edit `nodes.py` to replace the dummy `get_embedding` with `get_openai_embedding`:

```python
# Change this line:
query_embedding = get_embedding(query)
# To this:
query_embedding = get_openai_embedding(query)

# And also change this line:
return get_embedding(text)
# To this:
return get_openai_embedding(text)
```

2. Make sure your OpenAI API key is set:

```bash
export OPENAI_API_KEY="your-api-key-here"
```

## How It Works

The system is implemented as a two-stage pipeline with PocketFlow:

```mermaid
graph TD
    subgraph OfflineFlow[Offline Document Indexing]
        EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
    end

    subgraph OnlineFlow[Online Query Processing]
        EmbedQuery[EmbedQueryNode] --> RetrieveDoc[RetrieveDocumentNode]
    end
```

Here's what each part does:

1. **EmbedDocumentsNode**: Converts documents into vector representations
2. **CreateIndexNode**: Creates a searchable FAISS index from the embeddings
3. **EmbedQueryNode**: Converts the user query into the same vector space
4. **RetrieveDocumentNode**: Finds the most similar document using vector search
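
Both flows read from and write to a single shared store. The sketch below mirrors what `main.py` (added in this commit) does, with an abbreviated corpus:

```python
from flow import offline_flow, online_flow

shared = {
    "texts": ["PocketFlow is a 100-line Large Language Model Framework."],
    "embeddings": None,          # filled by EmbedDocumentsNode
    "index": None,               # filled by CreateIndexNode
    "query": "Large Language Model",
    "query_embedding": None,     # filled by EmbedQueryNode
    "retrieved_document": None,  # filled by RetrieveDocumentNode
}

offline_flow.run(shared)  # offline: embed documents and build the FAISS index
online_flow.run(shared)   # online: embed the query and retrieve the best match
print(shared["retrieved_document"]["text"])
```

Splitting the pipeline in two lets you run the offline indexing once and reuse the index for many queries.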

## Example Output

```
✅ Created 5 document embeddings
🔍 Creating search index...
✅ Index created with 5 vectors
🔍 Embedding query: Large Language Model
🔎 Searching for relevant documents...
📄 Retrieved document (index: 3, distance: 0.3296)
📄 Most relevant text: "PocketFlow is a 100-line Large Language Model Framework."
```

## Files

- [`main.py`](./main.py): Main entry point for running the RAG demonstration
- [`flow.py`](./flow.py): Configures the flows that connect the nodes
- [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
- [`utils.py`](./utils.py): Utility functions including the embedding function

@@ -0,0 +1,22 @@
from pocketflow import Flow
from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode

def get_offline_flow():
    # Create offline flow for document indexing
    embed_docs_node = EmbedDocumentsNode()
    create_index_node = CreateIndexNode()
    embed_docs_node >> create_index_node
    offline_flow = Flow(start=embed_docs_node)
    return offline_flow

def get_online_flow():
    # Create online flow for document retrieval
    embed_query_node = EmbedQueryNode()
    retrieve_doc_node = RetrieveDocumentNode()
    embed_query_node >> retrieve_doc_node
    online_flow = Flow(start=embed_query_node)
    return online_flow

# Initialize flows
offline_flow = get_offline_flow()
online_flow = get_online_flow()

@@ -0,0 +1,55 @@
import sys
from flow import offline_flow, online_flow

def run_rag_demo():
    """
    Run a demonstration of the RAG system.

    This function:
    1. Indexes a set of sample documents (offline flow)
    2. Takes a query from the command line
    3. Retrieves the most relevant document (online flow)
    """

    # Sample texts - corpus of documents to search
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Python is a popular programming language for data science.",
        "PocketFlow is a 100-line Large Language Model Framework.",
        "The weather is sunny and warm today.",
    ]

    print("=" * 50)
    print("PocketFlow RAG Document Retrieval")
    print("=" * 50)

    # Default query
    default_query = "Large Language Model"

    # Get query from command line if provided with --
    query = default_query
    for arg in sys.argv[1:]:
        if arg.startswith("--"):
            query = arg[2:]
            break

    # Single shared store for both flows
    shared = {
        "texts": texts,
        "embeddings": None,
        "index": None,
        "query": query,
        "query_embedding": None,
        "retrieved_document": None
    }

    # Run the offline flow (document indexing)
    offline_flow.run(shared)

    # Run the online flow to retrieve the most relevant document
    online_flow.run(shared)


if __name__ == "__main__":
    run_rag_demo()

@@ -0,0 +1,95 @@
from pocketflow import Node, Flow, BatchNode
import numpy as np
import faiss
from utils import get_embedding, get_openai_embedding

# Nodes for the offline flow
class EmbedDocumentsNode(BatchNode):
    def prep(self, shared):
        """Read texts from shared store and return as an iterable"""
        return shared["texts"]

    def exec(self, text):
        """Embed a single text"""
        return get_embedding(text)

    def post(self, shared, prep_res, exec_res_list):
        """Store embeddings in the shared store"""
        embeddings = np.array(exec_res_list, dtype=np.float32)
        shared["embeddings"] = embeddings
        print(f"✅ Created {len(embeddings)} document embeddings")
        return "default"

class CreateIndexNode(Node):
    def prep(self, shared):
        """Get embeddings from shared store"""
        return shared["embeddings"]

    def exec(self, embeddings):
        """Create FAISS index and add embeddings"""
        print("🔍 Creating search index...")
        dimension = embeddings.shape[1]

        # Create a flat L2 index
        index = faiss.IndexFlatL2(dimension)

        # Add the embeddings to the index
        index.add(embeddings)

        return index

    def post(self, shared, prep_res, exec_res):
        """Store the index in shared store"""
        shared["index"] = exec_res
        print(f"✅ Index created with {exec_res.ntotal} vectors")
        return "default"

# Nodes for the online flow
class EmbedQueryNode(Node):
    def prep(self, shared):
        """Get query from shared store"""
        return shared["query"]

    def exec(self, query):
        """Embed the query"""
        print(f"🔍 Embedding query: {query}")
        query_embedding = get_embedding(query)
        return np.array([query_embedding], dtype=np.float32)

    def post(self, shared, prep_res, exec_res):
        """Store query embedding in shared store"""
        shared["query_embedding"] = exec_res
        return "default"

class RetrieveDocumentNode(Node):
    def prep(self, shared):
        """Get query embedding, index, and texts from shared store"""
        return shared["query_embedding"], shared["index"], shared["texts"]

    def exec(self, inputs):
        """Search the index for similar documents"""
        print("🔎 Searching for relevant documents...")
        query_embedding, index, texts = inputs

        # Search for the most similar document
        distances, indices = index.search(query_embedding, k=1)

        # Get the index of the most similar document
        best_idx = indices[0][0]
        distance = distances[0][0]

        # Get the corresponding text
        most_relevant_text = texts[best_idx]

        return {
            "text": most_relevant_text,
            "index": best_idx,
            "distance": distance
        }

    def post(self, shared, prep_res, exec_res):
        """Store retrieved document in shared store"""
        shared["retrieved_document"] = exec_res
        print(f"📄 Retrieved document (index: {exec_res['index']}, distance: {exec_res['distance']:.4f})")
        print(f"📄 Most relevant text: \"{exec_res['text']}\"")
        return "default"

@@ -0,0 +1,4 @@
pocketflow>=0.0.5
numpy>=1.20.0
faiss-cpu>=1.7.0
openai>=1.0.0

@@ -0,0 +1,79 @@
import os
import numpy as np
from openai import OpenAI

def get_embedding(text):
    """
    A simple embedding function that converts text to a vector.

    In a real application, you would use a proper embedding model like OpenAI,
    Hugging Face, or other embedding services. For this example, we'll use a
    simple approach based on character frequencies for demonstration purposes.
    """
    # Create a simple embedding (128-dimensional) based on character frequencies
    # This is just for demonstration - not a real embedding algorithm!
    embedding = np.zeros(128, dtype=np.float32)

    # Generate a deterministic but distributed embedding based on character frequency
    for char in text:
        # Use modulo to distribute values across the embedding dimensions
        pos = ord(char) % 128
        embedding[pos] += 1.0

    # Normalize the embedding
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return embedding

def get_openai_embedding(text):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )

    # Extract the embedding vector from the response
    embedding = response.data[0].embedding

    # Convert to numpy array for consistency with other embedding functions
    return np.array(embedding, dtype=np.float32)


if __name__ == "__main__":
    # Test the embedding function
    text1 = "The quick brown fox jumps over the lazy dog."
    text2 = "Python is a popular programming language for data science."

    emb1 = get_embedding(text1)
    emb2 = get_embedding(text2)

    print(f"Embedding 1 shape: {emb1.shape}")
    print(f"Embedding 2 shape: {emb2.shape}")

    # Calculate similarity (dot product)
    similarity = np.dot(emb1, emb2)
    print(f"Similarity between texts: {similarity:.4f}")

    # Compare with a different text
    text3 = "Machine learning is a subset of artificial intelligence."
    emb3 = get_embedding(text3)
    similarity13 = np.dot(emb1, emb3)
    similarity23 = np.dot(emb2, emb3)

    print(f"Similarity between text1 and text3: {similarity13:.4f}")
    print(f"Similarity between text2 and text3: {similarity23:.4f}")

    # These simple comparisons should show higher similarity
    # between related concepts (text2 and text3) than between
    # unrelated texts (text1 and text3)

    # Uncomment to test OpenAI embeddings (requires an API key)
    # print("\nTesting OpenAI embeddings (requires API key):")
    # oai_emb1 = get_openai_embedding(text1)
    # oai_emb2 = get_openai_embedding(text2)
    # print(f"OpenAI Embedding 1 shape: {oai_emb1.shape}")
    # oai_similarity = np.dot(oai_emb1, oai_emb2)
    # print(f"OpenAI similarity between texts: {oai_similarity:.4f}")

@@ -29,12 +29,18 @@ Below you will find an overview table of various text embedding APIs, along with

 ### 1. OpenAI
 ```python
-import openai
+from openai import OpenAI
 
-openai.api_key = "YOUR_API_KEY"
-resp = openai.Embedding.create(model="text-embedding-ada-002", input="Hello world")
-vec = resp["data"][0]["embedding"]
-print(vec)
+client = OpenAI(api_key="YOUR_API_KEY")
+response = client.embeddings.create(
+    model="text-embedding-ada-002",
+    input="Hello world"
+)
+
+# Extract the embedding vector from the response
+embedding = response.data[0].embedding
+embedding = np.array(embedding, dtype=np.float32)
+print(embedding)
 ```

 ### 2. Azure OpenAI