update rag tutorial

This commit is contained in:
zachary62 2025-03-21 13:37:28 -04:00
parent 05cd05efe2
commit 833e0870cd
8 changed files with 359 additions and 7 deletions

View File

@ -1,4 +1,4 @@
 pocketflow>=0.0.1
-aiohttp>=3.8.0  # For async HTTP requests
-openai>=1.0.0  # For async LLM calls
+aiohttp>=3.8.0  # For HTTP requests
+openai>=1.0.0  # For LLM calls
 duckduckgo-search>=7.5.2  # For web search

View File

@ -0,0 +1,91 @@
# Retrieval Augmented Generation (RAG)
This project demonstrates a simplified RAG pipeline that retrieves the most relevant document for a user query.
## Features
- Simple vector-based document retrieval
- Two-stage pipeline (offline indexing, online querying)
- FAISS-powered similarity search
## Getting Started
1. Install the required dependencies:
```bash
pip install -r requirements.txt
```
2. Run the application with a sample query:
```bash
python main.py --"Large Language Model"
```
3. Or run without arguments to use the default query:
```bash
python main.py
```
## API Key
By default, the demo uses a dummy embedding based on character frequencies. To use real OpenAI embeddings:
1. Edit `nodes.py` to replace the dummy `get_embedding` with `get_openai_embedding`:
```python
# Change this line:
query_embedding = get_embedding(query)
# To this:
query_embedding = get_openai_embedding(query)
# And also change this line:
return get_embedding(text)
# To this:
return get_openai_embedding(text)
```
2. Make sure your OpenAI API key is set:
```bash
export OPENAI_API_KEY="your-api-key-here"
```
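Whichever embedding function you pick must be used for both the documents and the query, since both must live in the same vector space. If you would rather not edit code in two places, one option is a small wrapper; the `embed` helper below is a sketch, not part of this demo's code:
```python
# Hypothetical helper (not in this demo): pick the embedding
# function based on whether an OpenAI API key is configured.
import os
from utils import get_embedding, get_openai_embedding

def embed(text):
    if os.environ.get("OPENAI_API_KEY"):
        return get_openai_embedding(text)
    return get_embedding(text)  # fall back to the dummy embedding
```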
## How It Works
The system is a two-stage pipeline implemented with PocketFlow:
```mermaid
graph TD
subgraph OfflineFlow[Offline Document Indexing]
EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
end
subgraph OnlineFlow[Online Query Processing]
EmbedQuery[EmbedQueryNode] --> RetrieveDoc[RetrieveDocumentNode]
end
```
Here's what each part does:
1. **EmbedDocumentsNode**: Converts documents into vector representations
2. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
3. **EmbedQueryNode**: Converts user query into the same vector space
4. **RetrieveDocumentNode**: Finds the most similar document using vector search
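
Both flows communicate through a single shared store, so the index built offline is available to the online query. Condensed from `main.py`:
```python
from flow import offline_flow, online_flow

shared = {
    "texts": ["PocketFlow is a 100-line Large Language Model Framework."],
    "query": "Large Language Model",
}
offline_flow.run(shared)  # fills shared["embeddings"] and shared["index"]
online_flow.run(shared)   # fills shared["query_embedding"] and shared["retrieved_document"]
print(shared["retrieved_document"]["text"])
```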
## Example Output
```
✅ Created 5 document embeddings
🔍 Creating search index...
✅ Index created with 5 vectors
🔍 Embedding query: Large Language Model
🔎 Searching for relevant documents...
📄 Retrieved document (index: 3, distance: 0.3296)
📄 Most relevant text: "PocketFlow is a 100-line Large Language Model Framework."
```
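A note on the distance: FAISS's `IndexFlatL2` reports squared L2 distances, and because the demo embeddings are normalized to unit length, the distance converts directly to cosine similarity. A quick sanity check, using the 0.3296 value from the run above:
```python
# For unit vectors a and b: ||a - b||^2 = 2 - 2 * (a . b)
distance = 0.3296                  # squared L2 distance reported by FAISS above
cosine_similarity = 1 - distance / 2
print(f"{cosine_similarity:.4f}")  # 0.8352 -- lower distance means higher similarity
```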
## Files
- [`main.py`](./main.py): Main entry point for running the RAG demonstration
- [`flow.py`](./flow.py): Configures the flows that connect the nodes
- [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
- [`utils.py`](./utils.py): Utility functions including the embedding function

View File

@ -0,0 +1,22 @@
from pocketflow import Flow
from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode
def get_offline_flow():
    # Create offline flow for document indexing
    embed_docs_node = EmbedDocumentsNode()
    create_index_node = CreateIndexNode()
    embed_docs_node >> create_index_node
    offline_flow = Flow(start=embed_docs_node)
    return offline_flow

def get_online_flow():
    # Create online flow for document retrieval
    embed_query_node = EmbedQueryNode()
    retrieve_doc_node = RetrieveDocumentNode()
    embed_query_node >> retrieve_doc_node
    online_flow = Flow(start=embed_query_node)
    return online_flow

# Initialize flows
offline_flow = get_offline_flow()
online_flow = get_online_flow()

View File

@ -0,0 +1,55 @@
import sys
from flow import offline_flow, online_flow
def run_rag_demo():
    """
    Run a demonstration of the RAG system.

    This function:
    1. Indexes a set of sample documents (offline flow)
    2. Takes a query from the command line
    3. Retrieves the most relevant document (online flow)
    """
    # Sample texts - corpus of documents to search
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Python is a popular programming language for data science.",
        "PocketFlow is a 100-line Large Language Model Framework.",
        "The weather is sunny and warm today.",
    ]

    print("=" * 50)
    print("PocketFlow RAG Document Retrieval")
    print("=" * 50)

    # Default query
    default_query = "Large Language Model"

    # Get query from command line if provided with --
    query = default_query
    for arg in sys.argv[1:]:
        if arg.startswith("--"):
            query = arg[2:]
            break

    # Single shared store for both flows
    shared = {
        "texts": texts,
        "embeddings": None,
        "index": None,
        "query": query,
        "query_embedding": None,
        "retrieved_document": None
    }

    # Run the offline flow (document indexing)
    offline_flow.run(shared)

    # Run the online flow to retrieve the most relevant document
    online_flow.run(shared)

if __name__ == "__main__":
    run_rag_demo()

View File

@ -0,0 +1,95 @@
from pocketflow import Node, BatchNode
import numpy as np
import faiss
from utils import get_embedding, get_openai_embedding
# Nodes for the offline flow
class EmbedDocumentsNode(BatchNode):
    def prep(self, shared):
        """Read texts from shared store and return as an iterable"""
        return shared["texts"]

    def exec(self, text):
        """Embed a single text"""
        return get_embedding(text)

    def post(self, shared, prep_res, exec_res_list):
        """Store embeddings in the shared store"""
        embeddings = np.array(exec_res_list, dtype=np.float32)
        shared["embeddings"] = embeddings
        print(f"✅ Created {len(embeddings)} document embeddings")
        return "default"

class CreateIndexNode(Node):
    def prep(self, shared):
        """Get embeddings from shared store"""
        return shared["embeddings"]

    def exec(self, embeddings):
        """Create FAISS index and add embeddings"""
        print("🔍 Creating search index...")
        dimension = embeddings.shape[1]

        # Create a flat L2 index
        index = faiss.IndexFlatL2(dimension)

        # Add the embeddings to the index
        index.add(embeddings)
        return index

    def post(self, shared, prep_res, exec_res):
        """Store the index in shared store"""
        shared["index"] = exec_res
        print(f"✅ Index created with {exec_res.ntotal} vectors")
        return "default"

# Nodes for the online flow
class EmbedQueryNode(Node):
    def prep(self, shared):
        """Get query from shared store"""
        return shared["query"]

    def exec(self, query):
        """Embed the query"""
        print(f"🔍 Embedding query: {query}")
        query_embedding = get_embedding(query)
        return np.array([query_embedding], dtype=np.float32)

    def post(self, shared, prep_res, exec_res):
        """Store query embedding in shared store"""
        shared["query_embedding"] = exec_res
        return "default"

class RetrieveDocumentNode(Node):
    def prep(self, shared):
        """Get query embedding, index, and texts from shared store"""
        return shared["query_embedding"], shared["index"], shared["texts"]

    def exec(self, inputs):
        """Search the index for similar documents"""
        print("🔎 Searching for relevant documents...")
        query_embedding, index, texts = inputs

        # Search for the most similar document
        distances, indices = index.search(query_embedding, k=1)

        # Get the index of the most similar document
        best_idx = indices[0][0]
        distance = distances[0][0]

        # Get the corresponding text
        most_relevant_text = texts[best_idx]

        return {
            "text": most_relevant_text,
            "index": best_idx,
            "distance": distance
        }

    def post(self, shared, prep_res, exec_res):
        """Store retrieved document in shared store"""
        shared["retrieved_document"] = exec_res
        print(f"📄 Retrieved document (index: {exec_res['index']}, distance: {exec_res['distance']:.4f})")
        print(f"📄 Most relevant text: \"{exec_res['text']}\"")
        return "default"

View File

@ -0,0 +1,4 @@
pocketflow>=0.0.5
numpy>=1.20.0
faiss-cpu>=1.7.0
openai>=1.0.0

View File

@ -0,0 +1,79 @@
import os
import numpy as np
from openai import OpenAI
def get_embedding(text):
    """
    A simple embedding function that converts text to a vector.

    In a real application, you would use a proper embedding model from OpenAI,
    Hugging Face, or another embedding service. For this example, we use a
    simple approach based on character frequencies for demonstration purposes.
    """
    # Create a simple embedding (128-dimensional) based on character frequencies
    # This is just for demonstration - not a real embedding algorithm!
    embedding = np.zeros(128, dtype=np.float32)

    # Generate a deterministic but distributed embedding based on character frequency
    for char in text:
        # Use modulo to distribute values across the embedding dimensions
        pos = ord(char) % 128
        embedding[pos] += 1.0

    # Normalize the embedding
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return embedding

def get_openai_embedding(text):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )

    # Extract the embedding vector from the response
    embedding = response.data[0].embedding

    # Convert to a numpy array for consistency with the other embedding function
    return np.array(embedding, dtype=np.float32)

if __name__ == "__main__":
    # Test the embedding function
    text1 = "The quick brown fox jumps over the lazy dog."
    text2 = "Python is a popular programming language for data science."

    emb1 = get_embedding(text1)
    emb2 = get_embedding(text2)

    print(f"Embedding 1 shape: {emb1.shape}")
    print(f"Embedding 2 shape: {emb2.shape}")

    # Calculate similarity (dot product)
    similarity = np.dot(emb1, emb2)
    print(f"Similarity between texts: {similarity:.4f}")

    # Compare with a different text
    text3 = "Machine learning is a subset of artificial intelligence."
    emb3 = get_embedding(text3)
    similarity13 = np.dot(emb1, emb3)
    similarity23 = np.dot(emb2, emb3)
    print(f"Similarity between text1 and text3: {similarity13:.4f}")
    print(f"Similarity between text2 and text3: {similarity23:.4f}")

    # These simple comparisons should show higher similarity between related
    # concepts (text2 and text3) than between unrelated texts (text1 and text3)

    # Uncomment to test OpenAI embeddings (requires an API key):
    # print("\nTesting OpenAI embeddings (requires API key):")
    # oai_emb1 = get_openai_embedding(text1)
    # oai_emb2 = get_openai_embedding(text2)
    # print(f"OpenAI Embedding 1 shape: {oai_emb1.shape}")
    # oai_similarity = np.dot(oai_emb1, oai_emb2)
    # print(f"OpenAI similarity between texts: {oai_similarity:.4f}")

View File

@ -29,12 +29,18 @@ Below you will find an overview table of various text embedding APIs, along with
 ### 1. OpenAI
 ```python
-import openai
-openai.api_key = "YOUR_API_KEY"
-resp = openai.Embedding.create(model="text-embedding-ada-002", input="Hello world")
-vec = resp["data"][0]["embedding"]
-print(vec)
+from openai import OpenAI
+import numpy as np
+
+client = OpenAI(api_key="YOUR_API_KEY")
+response = client.embeddings.create(
+    model="text-embedding-ada-002",
+    input="Hello world"
+)
+# Extract the embedding vector from the response
+embedding = np.array(response.data[0].embedding, dtype=np.float32)
+print(embedding)
 ```
 ### 2. Azure OpenAI