update rag tutorial

This commit is contained in:
zachary62 2025-03-21 13:37:28 -04:00
parent 05cd05efe2
commit 833e0870cd
8 changed files with 359 additions and 7 deletions

View File

@ -1,4 +1,4 @@
 pocketflow>=0.0.1
-aiohttp>=3.8.0  # For async HTTP requests
-openai>=1.0.0  # For async LLM calls
+aiohttp>=3.8.0  # For HTTP requests
+openai>=1.0.0  # For LLM calls
 duckduckgo-search>=7.5.2  # For web search

View File

@ -0,0 +1,91 @@
# Retrieval Augmented Generation (RAG)
This project demonstrates a simplified RAG pipeline that retrieves the most relevant document for a user query.
## Features
- Simple vector-based document retrieval
- Two-stage pipeline (offline indexing, online querying)
- FAISS-powered similarity search
## Getting Started
1. Install the required dependencies:
```bash
pip install -r requirements.txt
```
2. Run the application with a sample query:
```bash
python main.py --"Large Language Model"
```
3. Or run without arguments to use the default query:
```bash
python main.py
```
## API Key
By default, the demo uses a dummy embedding based on character frequencies. To use real OpenAI embeddings:
1. Edit `nodes.py` to replace the dummy `get_embedding` with `get_openai_embedding`:
```python
# Change this line:
query_embedding = get_embedding(query)
# To this:
query_embedding = get_openai_embedding(query)
# And also change this line:
return get_embedding(text)
# To this:
return get_openai_embedding(text)
```
2. Make sure your OpenAI API key is set:
```bash
export OPENAI_API_KEY="your-api-key-here"
```
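Whichever embedding function you pick must be used for both the documents and the query, since both must live in the same vector space. If you would rather not edit code in two places, one option is a small wrapper; the `embed` helper below is a sketch, not part of this demo's code:
```python
# Hypothetical helper (not in this demo): pick the embedding
# function based on whether an OpenAI API key is configured.
import os
from utils import get_embedding, get_openai_embedding

def embed(text):
    if os.environ.get("OPENAI_API_KEY"):
        return get_openai_embedding(text)
    return get_embedding(text)  # fall back to the dummy embedding
```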
## How It Works
The system is a two-stage pipeline implemented with PocketFlow:
```mermaid
graph TD
subgraph OfflineFlow[Offline Document Indexing]
EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
end
subgraph OnlineFlow[Online Query Processing]
EmbedQuery[EmbedQueryNode] --> RetrieveDoc[RetrieveDocumentNode]
end
```
Here's what each part does:
1. **EmbedDocumentsNode**: Converts documents into vector representations
2. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
3. **EmbedQueryNode**: Converts user query into the same vector space
4. **RetrieveDocumentNode**: Finds the most similar document using vector search
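
Both flows communicate through a single shared store, so the index built offline is available to the online query. Condensed from `main.py`:
```python
from flow import offline_flow, online_flow

shared = {
    "texts": ["PocketFlow is a 100-line Large Language Model Framework."],
    "query": "Large Language Model",
}
offline_flow.run(shared)  # fills shared["embeddings"] and shared["index"]
online_flow.run(shared)   # fills shared["query_embedding"] and shared["retrieved_document"]
print(shared["retrieved_document"]["text"])
```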
## Example Output
```
✅ Created 5 document embeddings
🔍 Creating search index...
✅ Index created with 5 vectors
🔍 Embedding query: Large Language Model
🔎 Searching for relevant documents...
📄 Retrieved document (index: 3, distance: 0.3296)
📄 Most relevant text: "PocketFlow is a 100-line Large Language Model Framework."
```
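A note on the distance: FAISS's `IndexFlatL2` reports squared L2 distances, and because the demo embeddings are normalized to unit length, the distance converts directly to cosine similarity. A quick sanity check, using the 0.3296 value from the run above:
```python
# For unit vectors a and b: ||a - b||^2 = 2 - 2 * (a . b)
distance = 0.3296                  # squared L2 distance reported by FAISS above
cosine_similarity = 1 - distance / 2
print(f"{cosine_similarity:.4f}")  # 0.8352 -- lower distance means higher similarity
```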
## Files
- [`main.py`](./main.py): Main entry point for running the RAG demonstration
- [`flow.py`](./flow.py): Configures the flows that connect the nodes
- [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
- [`utils.py`](./utils.py): Utility functions including the embedding function

View File

@ -0,0 +1,22 @@
from pocketflow import Flow
from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode
def get_offline_flow():
    # Create offline flow for document indexing
    embed_docs_node = EmbedDocumentsNode()
    create_index_node = CreateIndexNode()
    embed_docs_node >> create_index_node
    offline_flow = Flow(start=embed_docs_node)
    return offline_flow

def get_online_flow():
    # Create online flow for document retrieval
    embed_query_node = EmbedQueryNode()
    retrieve_doc_node = RetrieveDocumentNode()
    embed_query_node >> retrieve_doc_node
    online_flow = Flow(start=embed_query_node)
    return online_flow

# Initialize flows
offline_flow = get_offline_flow()
online_flow = get_online_flow()

View File

@ -0,0 +1,55 @@
import sys
from flow import offline_flow, online_flow
def run_rag_demo():
    """
    Run a demonstration of the RAG system.

    This function:
    1. Indexes a set of sample documents (offline flow)
    2. Takes a query from the command line
    3. Retrieves the most relevant document (online flow)
    """
    # Sample texts - corpus of documents to search
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Python is a popular programming language for data science.",
        "PocketFlow is a 100-line Large Language Model Framework.",
        "The weather is sunny and warm today.",
    ]

    print("=" * 50)
    print("PocketFlow RAG Document Retrieval")
    print("=" * 50)

    # Default query
    default_query = "Large Language Model"

    # Get query from command line if provided with --
    query = default_query
    for arg in sys.argv[1:]:
        if arg.startswith("--"):
            query = arg[2:]
            break

    # Single shared store for both flows
    shared = {
        "texts": texts,
        "embeddings": None,
        "index": None,
        "query": query,
        "query_embedding": None,
        "retrieved_document": None
    }

    # Run the offline flow (document indexing)
    offline_flow.run(shared)

    # Run the online flow to retrieve the most relevant document
    online_flow.run(shared)

if __name__ == "__main__":
    run_rag_demo()

View File

@ -0,0 +1,95 @@
from pocketflow import Node, BatchNode
import numpy as np
import faiss
from utils import get_embedding, get_openai_embedding
# Nodes for the offline flow
class EmbedDocumentsNode(BatchNode):
    def prep(self, shared):
        """Read texts from shared store and return as an iterable"""
        return shared["texts"]

    def exec(self, text):
        """Embed a single text"""
        return get_embedding(text)

    def post(self, shared, prep_res, exec_res_list):
        """Store embeddings in the shared store"""
        embeddings = np.array(exec_res_list, dtype=np.float32)
        shared["embeddings"] = embeddings
        print(f"✅ Created {len(embeddings)} document embeddings")
        return "default"

class CreateIndexNode(Node):
    def prep(self, shared):
        """Get embeddings from shared store"""
        return shared["embeddings"]

    def exec(self, embeddings):
        """Create FAISS index and add embeddings"""
        print("🔍 Creating search index...")
        dimension = embeddings.shape[1]

        # Create a flat L2 index
        index = faiss.IndexFlatL2(dimension)

        # Add the embeddings to the index
        index.add(embeddings)
        return index

    def post(self, shared, prep_res, exec_res):
        """Store the index in shared store"""
        shared["index"] = exec_res
        print(f"✅ Index created with {exec_res.ntotal} vectors")
        return "default"

# Nodes for the online flow
class EmbedQueryNode(Node):
    def prep(self, shared):
        """Get query from shared store"""
        return shared["query"]

    def exec(self, query):
        """Embed the query"""
        print(f"🔍 Embedding query: {query}")
        query_embedding = get_embedding(query)
        return np.array([query_embedding], dtype=np.float32)

    def post(self, shared, prep_res, exec_res):
        """Store query embedding in shared store"""
        shared["query_embedding"] = exec_res
        return "default"

class RetrieveDocumentNode(Node):
    def prep(self, shared):
        """Get query embedding, index, and texts from shared store"""
        return shared["query_embedding"], shared["index"], shared["texts"]

    def exec(self, inputs):
        """Search the index for similar documents"""
        print("🔎 Searching for relevant documents...")
        query_embedding, index, texts = inputs

        # Search for the most similar document
        distances, indices = index.search(query_embedding, k=1)

        # Get the index of the most similar document
        best_idx = indices[0][0]
        distance = distances[0][0]

        # Get the corresponding text
        most_relevant_text = texts[best_idx]

        return {
            "text": most_relevant_text,
            "index": best_idx,
            "distance": distance
        }

    def post(self, shared, prep_res, exec_res):
        """Store retrieved document in shared store"""
        shared["retrieved_document"] = exec_res
        print(f"📄 Retrieved document (index: {exec_res['index']}, distance: {exec_res['distance']:.4f})")
        print(f"📄 Most relevant text: \"{exec_res['text']}\"")
        return "default"

View File

@ -0,0 +1,4 @@
pocketflow>=0.0.5
numpy>=1.20.0
faiss-cpu>=1.7.0
openai>=1.0.0

View File

@ -0,0 +1,79 @@
import os
import numpy as np
from openai import OpenAI
def get_embedding(text):
    """
    A simple embedding function that converts text to a vector.

    In a real application, you would use a proper embedding model from OpenAI,
    Hugging Face, or another embedding service. For this example, we use a
    simple approach based on character frequencies for demonstration purposes.
    """
    # Create a simple embedding (128-dimensional) based on character frequencies
    # This is just for demonstration - not a real embedding algorithm!
    embedding = np.zeros(128, dtype=np.float32)

    # Generate a deterministic but distributed embedding based on character frequency
    for char in text:
        # Use modulo to distribute values across the embedding dimensions
        pos = ord(char) % 128
        embedding[pos] += 1.0

    # Normalize the embedding
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return embedding

def get_openai_embedding(text):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )

    # Extract the embedding vector from the response
    embedding = response.data[0].embedding

    # Convert to a numpy array for consistency with the other embedding function
    return np.array(embedding, dtype=np.float32)

if __name__ == "__main__":
    # Test the embedding function
    text1 = "The quick brown fox jumps over the lazy dog."
    text2 = "Python is a popular programming language for data science."

    emb1 = get_embedding(text1)
    emb2 = get_embedding(text2)

    print(f"Embedding 1 shape: {emb1.shape}")
    print(f"Embedding 2 shape: {emb2.shape}")

    # Calculate similarity (dot product)
    similarity = np.dot(emb1, emb2)
    print(f"Similarity between texts: {similarity:.4f}")

    # Compare with a different text
    text3 = "Machine learning is a subset of artificial intelligence."
    emb3 = get_embedding(text3)
    similarity13 = np.dot(emb1, emb3)
    similarity23 = np.dot(emb2, emb3)
    print(f"Similarity between text1 and text3: {similarity13:.4f}")
    print(f"Similarity between text2 and text3: {similarity23:.4f}")

    # These simple comparisons should show higher similarity between related
    # concepts (text2 and text3) than between unrelated texts (text1 and text3)

    # Uncomment to test OpenAI embeddings (requires an API key):
    # print("\nTesting OpenAI embeddings (requires API key):")
    # oai_emb1 = get_openai_embedding(text1)
    # oai_emb2 = get_openai_embedding(text2)
    # print(f"OpenAI Embedding 1 shape: {oai_emb1.shape}")
    # oai_similarity = np.dot(oai_emb1, oai_emb2)
    # print(f"OpenAI similarity between texts: {oai_similarity:.4f}")

View File

@ -29,12 +29,18 @@ Below you will find an overview table of various text embedding APIs, along with
 ### 1. OpenAI
 ```python
-import openai
-openai.api_key = "YOUR_API_KEY"
-resp = openai.Embedding.create(model="text-embedding-ada-002", input="Hello world")
-vec = resp["data"][0]["embedding"]
-print(vec)
+from openai import OpenAI
+import numpy as np
+
+client = OpenAI(api_key="YOUR_API_KEY")
+response = client.embeddings.create(
+    model="text-embedding-ada-002",
+    input="Hello world"
+)
+# Extract the embedding vector from the response
+embedding = np.array(response.data[0].embedding, dtype=np.float32)
+print(embedding)
 ```
 ### 2. Azure OpenAI