update rag tutorial

parent 05cd05efe2
commit 833e0870cd

@@ -1,4 +1,4 @@
 pocketflow>=0.0.1
-aiohttp>=3.8.0 # For async HTTP requests
-openai>=1.0.0 # For async LLM calls
+aiohttp>=3.8.0 # For HTTP requests
+openai>=1.0.0 # For LLM calls
 duckduckgo-search>=7.5.2 # For web search

@@ -0,0 +1,91 @@
# Retrieval Augmented Generation (RAG)

This project demonstrates a simplified RAG system that retrieves relevant documents based on user queries.

## Features

- Simple vector-based document retrieval
- Two-stage pipeline (offline indexing, online querying)
- FAISS-powered similarity search

## Getting Started

1. Install the required dependencies:

```bash
pip install -r requirements.txt
```

2. Run the application with a sample query:

```bash
python main.py --"Large Language Model"
```

3. Or run without arguments to use the default query:

```bash
python main.py
```

## API Key

By default, the demo uses a dummy embedding based on character frequencies. To use real OpenAI embeddings:

1. Edit `nodes.py` to replace the dummy `get_embedding` with `get_openai_embedding`:

```python
# Change this line:
query_embedding = get_embedding(query)
# To this:
query_embedding = get_openai_embedding(query)

# And also change this line:
return get_embedding(text)
# To this:
return get_openai_embedding(text)
```

2. Make sure your OpenAI API key is set:

```bash
export OPENAI_API_KEY="your-api-key-here"
```

## How It Works

The system is implemented as a two-stage pipeline with PocketFlow:

```mermaid
graph TD
    subgraph OfflineFlow[Offline Document Indexing]
        EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
    end

    subgraph OnlineFlow[Online Query Processing]
        EmbedQuery[EmbedQueryNode] --> RetrieveDoc[RetrieveDocumentNode]
    end
```

Here's what each part does:

1. **EmbedDocumentsNode**: Converts documents into vector representations
2. **CreateIndexNode**: Creates a searchable FAISS index from the embeddings
3. **EmbedQueryNode**: Converts the user query into the same vector space
4. **RetrieveDocumentNode**: Finds the most similar document using vector search
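
Both flows read from and write to a single shared store. The sketch below mirrors what `main.py` (added in this commit) does, with an abbreviated corpus:

```python
from flow import offline_flow, online_flow

shared = {
    "texts": ["PocketFlow is a 100-line Large Language Model Framework."],
    "embeddings": None,          # filled by EmbedDocumentsNode
    "index": None,               # filled by CreateIndexNode
    "query": "Large Language Model",
    "query_embedding": None,     # filled by EmbedQueryNode
    "retrieved_document": None,  # filled by RetrieveDocumentNode
}

offline_flow.run(shared)  # offline: embed documents and build the FAISS index
online_flow.run(shared)   # online: embed the query and retrieve the best match
print(shared["retrieved_document"]["text"])
```

Splitting the pipeline in two lets you run the offline indexing once and reuse the index for many queries.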

## Example Output

```
✅ Created 5 document embeddings
🔍 Creating search index...
✅ Index created with 5 vectors
🔍 Embedding query: Large Language Model
🔎 Searching for relevant documents...
📄 Retrieved document (index: 3, distance: 0.3296)
📄 Most relevant text: "PocketFlow is a 100-line Large Language Model Framework."
```

## Files

- [`main.py`](./main.py): Main entry point for running the RAG demonstration
- [`flow.py`](./flow.py): Configures the flows that connect the nodes
- [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
- [`utils.py`](./utils.py): Utility functions including the embedding function

@@ -0,0 +1,22 @@
from pocketflow import Flow
from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode

def get_offline_flow():
    # Create offline flow for document indexing
    embed_docs_node = EmbedDocumentsNode()
    create_index_node = CreateIndexNode()
    embed_docs_node >> create_index_node
    offline_flow = Flow(start=embed_docs_node)
    return offline_flow

def get_online_flow():
    # Create online flow for document retrieval
    embed_query_node = EmbedQueryNode()
    retrieve_doc_node = RetrieveDocumentNode()
    embed_query_node >> retrieve_doc_node
    online_flow = Flow(start=embed_query_node)
    return online_flow

# Initialize flows
offline_flow = get_offline_flow()
online_flow = get_online_flow()

@@ -0,0 +1,55 @@
import sys
from flow import offline_flow, online_flow

def run_rag_demo():
    """
    Run a demonstration of the RAG system.

    This function:
    1. Indexes a set of sample documents (offline flow)
    2. Takes a query from the command line
    3. Retrieves the most relevant document (online flow)
    """

    # Sample texts - corpus of documents to search
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Python is a popular programming language for data science.",
        "PocketFlow is a 100-line Large Language Model Framework.",
        "The weather is sunny and warm today.",
    ]

    print("=" * 50)
    print("PocketFlow RAG Document Retrieval")
    print("=" * 50)

    # Default query
    default_query = "Large Language Model"

    # Get query from command line if provided with --
    query = default_query
    for arg in sys.argv[1:]:
        if arg.startswith("--"):
            query = arg[2:]
            break

    # Single shared store for both flows
    shared = {
        "texts": texts,
        "embeddings": None,
        "index": None,
        "query": query,
        "query_embedding": None,
        "retrieved_document": None
    }

    # Run the offline flow (document indexing)
    offline_flow.run(shared)

    # Run the online flow to retrieve the most relevant document
    online_flow.run(shared)


if __name__ == "__main__":
    run_rag_demo()

@@ -0,0 +1,95 @@
from pocketflow import Node, Flow, BatchNode
import numpy as np
import faiss
from utils import get_embedding, get_openai_embedding

# Nodes for the offline flow
class EmbedDocumentsNode(BatchNode):
    def prep(self, shared):
        """Read texts from shared store and return as an iterable"""
        return shared["texts"]

    def exec(self, text):
        """Embed a single text"""
        return get_embedding(text)

    def post(self, shared, prep_res, exec_res_list):
        """Store embeddings in the shared store"""
        embeddings = np.array(exec_res_list, dtype=np.float32)
        shared["embeddings"] = embeddings
        print(f"✅ Created {len(embeddings)} document embeddings")
        return "default"

class CreateIndexNode(Node):
    def prep(self, shared):
        """Get embeddings from shared store"""
        return shared["embeddings"]

    def exec(self, embeddings):
        """Create FAISS index and add embeddings"""
        print("🔍 Creating search index...")
        dimension = embeddings.shape[1]

        # Create a flat L2 index
        index = faiss.IndexFlatL2(dimension)

        # Add the embeddings to the index
        index.add(embeddings)

        return index

    def post(self, shared, prep_res, exec_res):
        """Store the index in shared store"""
        shared["index"] = exec_res
        print(f"✅ Index created with {exec_res.ntotal} vectors")
        return "default"

# Nodes for the online flow
class EmbedQueryNode(Node):
    def prep(self, shared):
        """Get query from shared store"""
        return shared["query"]

    def exec(self, query):
        """Embed the query"""
        print(f"🔍 Embedding query: {query}")
        query_embedding = get_embedding(query)
        return np.array([query_embedding], dtype=np.float32)

    def post(self, shared, prep_res, exec_res):
        """Store query embedding in shared store"""
        shared["query_embedding"] = exec_res
        return "default"

class RetrieveDocumentNode(Node):
    def prep(self, shared):
        """Get query embedding, index, and texts from shared store"""
        return shared["query_embedding"], shared["index"], shared["texts"]

    def exec(self, inputs):
        """Search the index for similar documents"""
        print("🔎 Searching for relevant documents...")
        query_embedding, index, texts = inputs

        # Search for the most similar document
        distances, indices = index.search(query_embedding, k=1)

        # Get the index of the most similar document
        best_idx = indices[0][0]
        distance = distances[0][0]

        # Get the corresponding text
        most_relevant_text = texts[best_idx]

        return {
            "text": most_relevant_text,
            "index": best_idx,
            "distance": distance
        }

    def post(self, shared, prep_res, exec_res):
        """Store retrieved document in shared store"""
        shared["retrieved_document"] = exec_res
        print(f"📄 Retrieved document (index: {exec_res['index']}, distance: {exec_res['distance']:.4f})")
        print(f"📄 Most relevant text: \"{exec_res['text']}\"")
        return "default"

@@ -0,0 +1,4 @@
pocketflow>=0.0.5
numpy>=1.20.0
faiss-cpu>=1.7.0
openai>=1.0.0

@@ -0,0 +1,79 @@
import os
import numpy as np
from openai import OpenAI

def get_embedding(text):
    """
    A simple embedding function that converts text to a vector.

    In a real application, you would use a proper embedding model like OpenAI,
    Hugging Face, or other embedding services. For this example, we'll use a
    simple approach based on character frequencies for demonstration purposes.
    """
    # Create a simple embedding (128-dimensional) based on character frequencies
    # This is just for demonstration - not a real embedding algorithm!
    embedding = np.zeros(128, dtype=np.float32)

    # Generate a deterministic but distributed embedding based on character frequency
    for char in text:
        # Use modulo to distribute values across the embedding dimensions
        pos = ord(char) % 128
        embedding[pos] += 1.0

    # Normalize the embedding
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return embedding

def get_openai_embedding(text):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )

    # Extract the embedding vector from the response
    embedding = response.data[0].embedding

    # Convert to numpy array for consistency with other embedding functions
    return np.array(embedding, dtype=np.float32)


if __name__ == "__main__":
    # Test the embedding function
    text1 = "The quick brown fox jumps over the lazy dog."
    text2 = "Python is a popular programming language for data science."

    emb1 = get_embedding(text1)
    emb2 = get_embedding(text2)

    print(f"Embedding 1 shape: {emb1.shape}")
    print(f"Embedding 2 shape: {emb2.shape}")

    # Calculate similarity (dot product)
    similarity = np.dot(emb1, emb2)
    print(f"Similarity between texts: {similarity:.4f}")

    # Compare with a different text
    text3 = "Machine learning is a subset of artificial intelligence."
    emb3 = get_embedding(text3)
    similarity13 = np.dot(emb1, emb3)
    similarity23 = np.dot(emb2, emb3)

    print(f"Similarity between text1 and text3: {similarity13:.4f}")
    print(f"Similarity between text2 and text3: {similarity23:.4f}")

    # These simple comparisons should show higher similarity
    # between related concepts (text2 and text3) than between
    # unrelated texts (text1 and text3)

    # Uncomment to test OpenAI embeddings (requires an API key)
    # print("\nTesting OpenAI embeddings (requires API key):")
    # oai_emb1 = get_openai_embedding(text1)
    # oai_emb2 = get_openai_embedding(text2)
    # print(f"OpenAI Embedding 1 shape: {oai_emb1.shape}")
    # oai_similarity = np.dot(oai_emb1, oai_emb2)
    # print(f"OpenAI similarity between texts: {oai_similarity:.4f}")

@@ -29,12 +29,18 @@ Below you will find an overview table of various text embedding APIs, along with

 ### 1. OpenAI
 ```python
-import openai
+from openai import OpenAI
 
-openai.api_key = "YOUR_API_KEY"
-resp = openai.Embedding.create(model="text-embedding-ada-002", input="Hello world")
-vec = resp["data"][0]["embedding"]
-print(vec)
+client = OpenAI(api_key="YOUR_API_KEY")
+response = client.embeddings.create(
+    model="text-embedding-ada-002",
+    input="Hello world"
+)
+
+# Extract the embedding vector from the response
+embedding = response.data[0].embedding
+embedding = np.array(embedding, dtype=np.float32)
+print(embedding)
 ```

 ### 2. Azure OpenAI