update rag tutorial

zachary62 2025-03-30 19:57:57 -04:00
parent f6818621f2
commit 337dcc0d66
4 changed files with 44 additions and 9 deletions

README.md

@@ -4,6 +4,7 @@ This project demonstrates a simplified RAG system that retrieves relevant docume
 ## Features
+- Document chunking for better retrieval granularity
 - Simple vector-based document retrieval
 - Two-stage pipeline (offline indexing, online querying)
 - FAISS-powered similarity search
@@ -57,6 +58,7 @@ The magic happens through a two-stage pipeline implemented with PocketFlow:
 ```mermaid
 graph TD
     subgraph OfflineFlow[Offline Document Indexing]
+        ChunkDocs[ChunkDocumentsNode] --> EmbedDocs[EmbedDocumentsNode]
         EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
     end
@@ -66,14 +68,19 @@ graph TD
 ```

 Here's what each part does:
-1. **EmbedDocumentsNode**: Converts documents into vector representations
-2. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
-3. **EmbedQueryNode**: Converts user query into the same vector space
-4. **RetrieveDocumentNode**: Finds the most similar document using vector search
+1. **ChunkDocumentsNode**: Splits documents into smaller chunks for more granular retrieval
+2. **EmbedDocumentsNode**: Converts document chunks into vector representations
+3. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
+4. **EmbedQueryNode**: Converts user query into the same vector space
+5. **RetrieveDocumentNode**: Finds the most similar document chunk using vector search

 ## Example Output
 ```
+==================================================
+PocketFlow RAG Document Retrieval
+==================================================
+✅ Created 5 chunks from 5 documents
 ✅ Created 5 document embeddings
 🔍 Creating search index...
 ✅ Index created with 5 vectors
@@ -88,4 +95,4 @@ Here's what each part does:
 - [`main.py`](./main.py): Main entry point for running the RAG demonstration
 - [`flow.py`](./flow.py): Configures the flows that connect the nodes
 - [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
-- [`utils.py`](./utils.py): Utility functions including the embedding function
+- [`utils.py`](./utils.py): Utility functions including chunking and embedding functions
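
Tying the README changes together: a minimal sketch of how `main.py` might drive the two flows, assuming PocketFlow's `flow.run(shared)` entry point. Only the `texts` key is confirmed by this diff; the `query` key and the sample documents are illustrative.

```python
# Hypothetical driver for the two-stage pipeline; only shared["texts"]
# is confirmed by this diff, the "query" key is an assumption.
from flow import get_offline_flow, get_online_flow

shared = {
    "texts": [
        "PocketFlow is a minimalist LLM framework.",
        "FAISS performs efficient vector similarity search.",
    ],
}

# Offline: chunk -> embed -> index (ChunkDocumentsNode now runs first)
get_offline_flow().run(shared)

# Online: embed the query, then retrieve the most similar chunk
shared["query"] = "What is PocketFlow?"
get_online_flow().run(shared)
```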

flow.py

@@ -1,12 +1,13 @@
 from pocketflow import Flow
-from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode
+from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode, ChunkDocumentsNode

 def get_offline_flow():
     # Create offline flow for document indexing
+    chunk_docs_node = ChunkDocumentsNode()
     embed_docs_node = EmbedDocumentsNode()
     create_index_node = CreateIndexNode()
-    embed_docs_node >> create_index_node
-    offline_flow = Flow(start=embed_docs_node)
+    chunk_docs_node >> embed_docs_node >> create_index_node
+    offline_flow = Flow(start=chunk_docs_node)
     return offline_flow

 def get_online_flow():
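
The rewired `chunk_docs_node >> embed_docs_node >> create_index_node` line works because PocketFlow overloads `>>` to chain nodes. A simplified stand-in for that pattern (not PocketFlow's actual source) shows why the chaining composes left to right:

```python
# Sketch of the operator-chaining pattern behind `a >> b >> c`;
# a simplified stand-in, not PocketFlow's real implementation.
class NodeSketch:
    def __init__(self, name):
        self.name = name
        self.successors = {}

    def __rshift__(self, other):
        # Register `other` as the default successor, then return it
        # so the expression can keep chaining left to right.
        self.successors["default"] = other
        return other

a, b, c = NodeSketch("chunk"), NodeSketch("embed"), NodeSketch("index")
a >> b >> c
print(a.successors["default"].name)  # embed
print(b.successors["default"].name)  # index
```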

nodes.py

@@ -1,9 +1,31 @@
 from pocketflow import Node, Flow, BatchNode
 import numpy as np
 import faiss
-from utils import get_embedding, get_openai_embedding
+from utils import get_embedding, get_openai_embedding, fixed_size_chunk

 # Nodes for the offline flow
+class ChunkDocumentsNode(BatchNode):
+    def prep(self, shared):
+        """Read texts from shared store"""
+        return shared["texts"]
+
+    def exec(self, text):
+        """Chunk a single text into smaller pieces"""
+        return fixed_size_chunk(text)
+
+    def post(self, shared, prep_res, exec_res_list):
+        """Store chunked texts in the shared store"""
+        # Flatten the list of lists into a single list of chunks
+        all_chunks = []
+        for chunks in exec_res_list:
+            all_chunks.extend(chunks)
+
+        # Replace the original texts with the flat list of chunks
+        shared["texts"] = all_chunks
+
+        print(f"✅ Created {len(all_chunks)} chunks from {len(prep_res)} documents")
+        return "default"
+
 class EmbedDocumentsNode(BatchNode):
     def prep(self, shared):
         """Read texts from shared store and return as an iterable"""

utils.py

@@ -41,6 +41,11 @@ def get_openai_embedding(text):
     # Convert to numpy array for consistency with other embedding functions
     return np.array(embedding, dtype=np.float32)

+def fixed_size_chunk(text, chunk_size=2000):
+    chunks = []
+    for i in range(0, len(text), chunk_size):
+        chunks.append(text[i : i + chunk_size])
+    return chunks

 if __name__ == "__main__":
     # Test the embedding function
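
`fixed_size_chunk` slices purely by character count, so the last chunk may be shorter than `chunk_size`, and any text under 2000 characters comes back as a single chunk. A quick sanity check of that behavior:

```python
# Quick check of fixed_size_chunk's slicing behavior.
text = "x" * 4500
chunks = fixed_size_chunk(text, chunk_size=2000)
print([len(c) for c in chunks])             # [2000, 2000, 500]
print(len(fixed_size_chunk("short text")))  # 1: under chunk_size, one chunk
```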