update rag tutorial
This commit is contained in:
parent
f6818621f2
commit
337dcc0d66
|
|
@ -4,6 +4,7 @@ This project demonstrates a simplified RAG system that retrieves relevant docume
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
|
- Document chunking for better retrieval granularity
|
||||||
- Simple vector-based document retrieval
|
- Simple vector-based document retrieval
|
||||||
- Two-stage pipeline (offline indexing, online querying)
|
- Two-stage pipeline (offline indexing, online querying)
|
||||||
- FAISS-powered similarity search
|
- FAISS-powered similarity search
|
||||||
|
|
@ -57,6 +58,7 @@ The magic happens through a two-stage pipeline implemented with PocketFlow:
|
||||||
```mermaid
|
```mermaid
|
||||||
graph TD
|
graph TD
|
||||||
subgraph OfflineFlow[Offline Document Indexing]
|
subgraph OfflineFlow[Offline Document Indexing]
|
||||||
|
ChunkDocs[ChunkDocumentsNode] --> EmbedDocs[EmbedDocumentsNode]
|
||||||
EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
|
EmbedDocs[EmbedDocumentsNode] --> CreateIndex[CreateIndexNode]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
@ -66,14 +68,19 @@ graph TD
|
||||||
```
|
```
|
||||||
|
|
||||||
Here's what each part does:
|
Here's what each part does:
|
||||||
1. **EmbedDocumentsNode**: Converts documents into vector representations
|
1. **ChunkDocumentsNode**: Splits documents into smaller chunks for more granular retrieval
|
||||||
2. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
|
2. **EmbedDocumentsNode**: Converts document chunks into vector representations
|
||||||
3. **EmbedQueryNode**: Converts user query into the same vector space
|
3. **CreateIndexNode**: Creates a searchable FAISS index from embeddings
|
||||||
4. **RetrieveDocumentNode**: Finds the most similar document using vector search
|
4. **EmbedQueryNode**: Converts user query into the same vector space
|
||||||
|
5. **RetrieveDocumentNode**: Finds the most similar document chunk using vector search
|
||||||
|
|
||||||
## Example Output
|
## Example Output
|
||||||
|
|
||||||
```
|
```
|
||||||
|
==================================================
|
||||||
|
PocketFlow RAG Document Retrieval
|
||||||
|
==================================================
|
||||||
|
✅ Created 5 chunks from 5 documents
|
||||||
✅ Created 5 document embeddings
|
✅ Created 5 document embeddings
|
||||||
🔍 Creating search index...
|
🔍 Creating search index...
|
||||||
✅ Index created with 5 vectors
|
✅ Index created with 5 vectors
|
||||||
|
|
@ -88,4 +95,4 @@ Here's what each part does:
|
||||||
- [`main.py`](./main.py): Main entry point for running the RAG demonstration
|
- [`main.py`](./main.py): Main entry point for running the RAG demonstration
|
||||||
- [`flow.py`](./flow.py): Configures the flows that connect the nodes
|
- [`flow.py`](./flow.py): Configures the flows that connect the nodes
|
||||||
- [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
|
- [`nodes.py`](./nodes.py): Defines the nodes for document processing and retrieval
|
||||||
- [`utils.py`](./utils.py): Utility functions including the embedding function
|
- [`utils.py`](./utils.py): Utility functions including chunking and embedding functions
|
||||||
|
|
@ -1,12 +1,13 @@
|
||||||
from pocketflow import Flow
|
from pocketflow import Flow
|
||||||
from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode
|
from nodes import EmbedDocumentsNode, CreateIndexNode, EmbedQueryNode, RetrieveDocumentNode, ChunkDocumentsNode
|
||||||
|
|
||||||
def get_offline_flow():
    """Build the offline indexing flow: chunk documents, embed the chunks,
    then create the FAISS index.

    Returns:
        Flow: a pipeline starting at the chunking node.
    """
    chunking = ChunkDocumentsNode()
    embedding = EmbedDocumentsNode()
    indexing = CreateIndexNode()
    # Wire the three stages in order: chunk -> embed -> index.
    chunking >> embedding >> indexing
    return Flow(start=chunking)
|
||||||
|
|
||||||
def get_online_flow():
|
def get_online_flow():
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,31 @@
|
||||||
from pocketflow import Node, Flow, BatchNode
|
from pocketflow import Node, Flow, BatchNode
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import faiss
|
import faiss
|
||||||
from utils import get_embedding, get_openai_embedding
|
from utils import get_embedding, get_openai_embedding, fixed_size_chunk
|
||||||
|
|
||||||
# Nodes for the offline flow
|
# Nodes for the offline flow
|
||||||
|
class ChunkDocumentsNode(BatchNode):
    """Batch node that splits each text in shared["texts"] into fixed-size
    chunks and replaces the original texts with the flattened chunk list."""

    def prep(self, shared):
        """Return the list of texts to be chunked from the shared store."""
        return shared["texts"]

    def exec(self, text):
        """Split one text into smaller fixed-size pieces."""
        return fixed_size_chunk(text)

    def post(self, shared, prep_res, exec_res_list):
        """Flatten the per-document chunk lists and store them back."""
        # exec_res_list is one list of chunks per input document; flatten it.
        all_chunks = [chunk for chunks in exec_res_list for chunk in chunks]

        # The downstream embedding node reads shared["texts"], so overwrite
        # the original documents with the finer-grained chunks.
        shared["texts"] = all_chunks

        print(f"✅ Created {len(all_chunks)} chunks from {len(prep_res)} documents")
        return "default"
|
||||||
|
|
||||||
class EmbedDocumentsNode(BatchNode):
|
class EmbedDocumentsNode(BatchNode):
|
||||||
def prep(self, shared):
|
def prep(self, shared):
|
||||||
"""Read texts from shared store and return as an iterable"""
|
"""Read texts from shared store and return as an iterable"""
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,11 @@ def get_openai_embedding(text):
|
||||||
# Convert to numpy array for consistency with other embedding functions
|
# Convert to numpy array for consistency with other embedding functions
|
||||||
return np.array(embedding, dtype=np.float32)
|
return np.array(embedding, dtype=np.float32)
|
||||||
|
|
||||||
|
def fixed_size_chunk(text, chunk_size=2000):
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    Args:
        text: the string to split.
        chunk_size: maximum length of each chunk (default 2000).

    Returns:
        list[str]: the chunks in order; the last one may be shorter.
        An empty input yields an empty list.
    """
    # Slicing past the end of a string is safe in Python, so the final
    # partial chunk needs no special handling.
    return [text[start : start + chunk_size] for start in range(0, len(text), chunk_size)]
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Test the embedding function
|
# Test the embedding function
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue