From af48dbf402abc82550a7e015d376da3f63abec41 Mon Sep 17 00:00:00 2001 From: zachary62 Date: Wed, 5 Mar 2025 22:11:43 -0500 Subject: [PATCH] add more docs --- docs/design_pattern/rag.md | 6 +- docs/index.md | 3 +- docs/utility_function/chunking.md | 4 +- docs/utility_function/embedding.md | 6 +- docs/utility_function/text_to_speech.md | 107 ++++++++++++ docs/utility_function/vector.md | 218 ++++++++++++++++++++++++ docs/utility_function/websearch.md | 2 +- 7 files changed, 338 insertions(+), 8 deletions(-) create mode 100644 docs/utility_function/text_to_speech.md diff --git a/docs/design_pattern/rag.md b/docs/design_pattern/rag.md index 0bb2497..fa9790e 100644 --- a/docs/design_pattern/rag.md +++ b/docs/design_pattern/rag.md @@ -8,7 +8,11 @@ nav_order: 4 # RAG (Retrieval Augmented Generation) For certain LLM tasks like answering questions, providing context is essential. -Use [vector search](../utility_function/tool.md) to find relevant context for LLM responses. +The most common way to retrieve text-based context is through embedding: +1. Given texts, you first [chunk](../utility_function/chunking.md) them. +2. Next, you [embed](../utility_function/embedding.md) each chunk. +3. Then you store the chunks in [vector databases](../utility_function/vector.md). +4. Finally, given a query, you embed the query and find the closest chunk in the vector databases. ### Example: Question Answering diff --git a/docs/index.md b/docs/index.md index a3b0524..466060a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,7 +41,8 @@ We model the LLM workflow as a **Graph + Shared Store**: - [(Optional) Web Search](./utility_function/websearch.md) - [(Optional) Chunking](./utility_function/chunking.md) - [(Optional) Embedding](./utility_function/embedding.md) -- [(Optional) Vector](./utility_function/vector.md) +- [(Optional) Vector Databases](./utility_function/vector.md) +- [(Optional) Text-to-Speech](./utility_function/text_to_speech.md) > We do not provide built-in utility functions. 
Example implementations are provided as reference. {: .warning } diff --git a/docs/utility_function/chunking.md b/docs/utility_function/chunking.md index 9bd609b..c7fd71a 100644 --- a/docs/utility_function/chunking.md +++ b/docs/utility_function/chunking.md @@ -32,7 +32,7 @@ def fixed_size_chunk(text, chunk_size=100): However, sentences are often cut awkwardly, losing coherence. -### Sentence-Based Chunking +### 2. Sentence-Based Chunking ```python import nltk @@ -47,7 +47,7 @@ def sentence_based_chunk(text, max_sentences=2): However, might not handle very long sentences or paragraphs well. -### Other Chunking +### 3. Other Chunking - **Paragraph-Based**: Split text by paragraphs (e.g., newlines). Large paragraphs can create big chunks. - **Semantic**: Use embeddings or topic modeling to chunk by semantic boundaries. diff --git a/docs/utility_function/embedding.md b/docs/utility_function/embedding.md index 90dcca9..6261554 100644 --- a/docs/utility_function/embedding.md +++ b/docs/utility_function/embedding.md @@ -1,7 +1,7 @@ --- layout: default -title: "Web Search" -parent: "Embedding" +title: "Embedding" +parent: "Utility Function" nav_order: 6 --- @@ -15,7 +15,7 @@ Below you will find an overview table of various text embedding APIs, along with {: .best-practice } -| **API** | **Free Tier** | **Pricing** | **Docs** | +| **API** | **Free Tier** | **Pricing Model** | **Docs** | | --- | --- | --- | --- | | **OpenAI** | ~$5 credit | ~$0.0001/1K tokens | [OpenAI Embeddings](https://platform.openai.com/docs/api-reference/embeddings) | | **Azure OpenAI** | $200 credit | Same as OpenAI (~$0.0001/1K tokens) | [Azure OpenAI Embeddings](https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource?tabs=portal) | diff --git a/docs/utility_function/text_to_speech.md b/docs/utility_function/text_to_speech.md new file mode 100644 index 0000000..74445cc --- /dev/null +++ b/docs/utility_function/text_to_speech.md @@ -0,0 +1,107 @@ +--- +layout: default 
+title: "Text-to-Speech" +parent: "Utility Function" +nav_order: 8 +--- + +## Text-to-Speech + +| **Service** | **Free Tier** | **Pricing Model** | **Docs** | +|----------------------|-----------------------|--------------------------------------------------------------|---------------------------------------------------------------------| +| **Amazon Polly** | 5M std + 1M neural | ~$4 /M (std), ~$16 /M (neural) after free tier | [Polly Docs](https://aws.amazon.com/polly/) | +| **Google Cloud TTS** | 4M std + 1M WaveNet | ~$4 /M (std), ~$16 /M (WaveNet) pay-as-you-go | [Cloud TTS Docs](https://cloud.google.com/text-to-speech) | +| **Azure TTS** | 500K neural ongoing | ~$15 /M (neural), discount at higher volumes | [Azure TTS Docs](https://azure.microsoft.com/products/cognitive-services/text-to-speech/) | +| **IBM Watson TTS** | 10K chars Lite plan | ~$0.02 /1K (i.e. ~$20 /M). Enterprise options available | [IBM Watson Docs](https://www.ibm.com/cloud/watson-text-to-speech) | +| **ElevenLabs** | 10K chars monthly | From ~$5/mo (30K chars) up to $330/mo (2M chars). 
Enterprise | [ElevenLabs Docs](https://elevenlabs.io) | + +## Example Python Code + +### Amazon Polly +```python +import boto3 + +polly = boto3.client("polly", region_name="us-east-1", + aws_access_key_id="YOUR_AWS_ACCESS_KEY_ID", + aws_secret_access_key="YOUR_AWS_SECRET_ACCESS_KEY") + +resp = polly.synthesize_speech( + Text="Hello from Polly!", + OutputFormat="mp3", + VoiceId="Joanna" +) + +with open("polly.mp3", "wb") as f: + f.write(resp["AudioStream"].read()) +``` + +### Google Cloud TTS +```python +from google.cloud import texttospeech + +client = texttospeech.TextToSpeechClient() +input_text = texttospeech.SynthesisInput(text="Hello from Google Cloud TTS!") +voice = texttospeech.VoiceSelectionParams(language_code="en-US") +audio_cfg = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3) + +resp = client.synthesize_speech(input=input_text, voice=voice, audio_config=audio_cfg) + +with open("gcloud_tts.mp3", "wb") as f: + f.write(resp.audio_content) +``` + +### Azure TTS +```python +import azure.cognitiveservices.speech as speechsdk + +speech_config = speechsdk.SpeechConfig( + subscription="AZURE_KEY", region="AZURE_REGION") +audio_cfg = speechsdk.audio.AudioConfig(filename="azure_tts.wav") + +synthesizer = speechsdk.SpeechSynthesizer( + speech_config=speech_config, + audio_config=audio_cfg +) + +synthesizer.speak_text_async("Hello from Azure TTS!").get() +``` + +### IBM Watson TTS +```python +from ibm_watson import TextToSpeechV1 +from ibm_cloud_sdk_core.authenticators import IAMAuthenticator + +auth = IAMAuthenticator("IBM_API_KEY") +service = TextToSpeechV1(authenticator=auth) +service.set_service_url("IBM_SERVICE_URL") + +resp = service.synthesize( + "Hello from IBM Watson!", + voice="en-US_AllisonV3Voice", + accept="audio/mp3" +).get_result() + +with open("ibm_tts.mp3", "wb") as f: + f.write(resp.content) +``` + +### ElevenLabs +```python +import requests + +api_key = "ELEVENLABS_KEY" +voice_id = "ELEVENLABS_VOICE" +url = 
f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}" +headers = {"xi-api-key": api_key, "Content-Type": "application/json"} + +json_data = { + "text": "Hello from ElevenLabs!", + "voice_settings": {"stability": 0.75, "similarity_boost": 0.75} +} + +resp = requests.post(url, headers=headers, json=json_data) + +with open("elevenlabs.mp3", "wb") as f: + f.write(resp.content) +``` + diff --git a/docs/utility_function/vector.md b/docs/utility_function/vector.md index e69de29..f309831 100644 --- a/docs/utility_function/vector.md +++ b/docs/utility_function/vector.md @@ -0,0 +1,218 @@ +--- +layout: default +title: "Vector Databases" +parent: "Utility Function" +nav_order: 7 +--- + +# Vector Databases + + +Below is a table of the popular vector search solutions: + +| **Tool** | **Free Tier** | **Pricing Model** | **Docs** | +| --- | --- | --- | --- | +| **FAISS** | N/A, self-host | Open-source | [Faiss.ai](https://faiss.ai) | +| **Pinecone** | 2GB free | From $25/mo | [pinecone.io](https://pinecone.io) | +| **Qdrant** | 1GB free cloud | Pay-as-you-go | [qdrant.tech](https://qdrant.tech) | +| **Weaviate** | 14-day sandbox | From $25/mo | [weaviate.io](https://weaviate.io) | +| **Milvus** | 5GB free cloud | PAYG or $99/mo dedicated | [milvus.io](https://milvus.io) | +| **Chroma** | N/A, self-host | Free (Apache 2.0) | [trychroma.com](https://trychroma.com) | +| **Redis** | 30MB free | From $5/mo | [redis.io](https://redis.io) | + +--- +## Example Python Code + +Below are basic usage snippets for each tool. 
+ +### FAISS +```python +import faiss +import numpy as np + +# Dimensionality of embeddings +d = 128 + +# Create a flat L2 index +index = faiss.IndexFlatL2(d) + +# Random vectors +data = np.random.random((1000, d)).astype('float32') +index.add(data) + +# Query +query = np.random.random((1, d)).astype('float32') +D, I = index.search(query, k=5) + +print("Distances:", D) +print("Neighbors:", I) +``` + +### Pinecone +```python +import pinecone + +pinecone.init(api_key="YOUR_API_KEY", environment="YOUR_ENV") + +index_name = "my-index" + +# Create the index if it doesn't exist +if index_name not in pinecone.list_indexes(): + pinecone.create_index(name=index_name, dimension=128) + +# Connect +index = pinecone.Index(index_name) + +# Upsert +vectors = [ + ("id1", [0.1]*128), + ("id2", [0.2]*128) +] +index.upsert(vectors) + +# Query +response = index.query([[0.15]*128], top_k=3) +print(response) +``` + +### Qdrant +```python +import qdrant_client +from qdrant_client.models import Distance, VectorParams, PointStruct + +client = qdrant_client.QdrantClient( + url="https://YOUR-QDRANT-CLOUD-ENDPOINT", + api_key="YOUR_API_KEY" +) + +collection = "my_collection" +client.recreate_collection( + collection_name=collection, + vectors_config=VectorParams(size=128, distance=Distance.COSINE) +) + +points = [ + PointStruct(id=1, vector=[0.1]*128, payload={"type": "doc1"}), + PointStruct(id=2, vector=[0.2]*128, payload={"type": "doc2"}), +] + +client.upsert(collection_name=collection, points=points) + +results = client.search( + collection_name=collection, + query_vector=[0.15]*128, + limit=2 +) +print(results) +``` + +### Weaviate +```python +import weaviate + +client = weaviate.Client("https://YOUR-WEAVIATE-CLOUD-ENDPOINT") + +schema = { + "classes": [ + { + "class": "Article", + "vectorizer": "none" + } + ] +} +client.schema.create(schema) + +obj = { + "title": "Hello World", + "content": "Weaviate vector search" +} +client.data_object.create(obj, "Article", vector=[0.1]*128) + +resp = 
( + client.query + .get("Article", ["title", "content"]) + .with_near_vector({"vector": [0.15]*128}) + .with_limit(3) + .do() +) +print(resp) +``` + +### Milvus +```python +from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection +import numpy as np + +connections.connect(alias="default", host="localhost", port="19530") + +fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128) +] +schema = CollectionSchema(fields) +collection = Collection("MyCollection", schema) + +emb = np.random.rand(10, 128).astype('float32') +ids = list(range(10)) +collection.insert([ids, emb]) + +index_params = { + "index_type": "IVF_FLAT", + "params": {"nlist": 128}, + "metric_type": "L2" +} +collection.create_index("embedding", index_params) +collection.load() + +query_emb = np.random.rand(1, 128).astype('float32') +results = collection.search(query_emb, "embedding", param={"nprobe": 10}, limit=3) +print(results) +``` + +### Chroma +```python +import chromadb +from chromadb.config import Settings + +client = chromadb.Client(Settings( + chroma_db_impl="duckdb+parquet", + persist_directory="./chroma_data" +)) + +coll = client.create_collection("my_collection") + +vectors = [[0.1, 0.2, 0.3], [0.2, 0.2, 0.2]] +metas = [{"doc": "text1"}, {"doc": "text2"}] +ids = ["id1", "id2"] +coll.add(embeddings=vectors, metadatas=metas, ids=ids) + +res = coll.query(query_embeddings=[[0.15, 0.25, 0.3]], n_results=2) +print(res) +``` + +### Redis +```python +import redis +import struct + +r = redis.Redis(host="localhost", port=6379) + +# Create index +r.execute_command( + "FT.CREATE", "my_idx", "ON", "HASH", + "SCHEMA", "embedding", "VECTOR", "FLAT", "6", + "TYPE", "FLOAT32", "DIM", "128", + "DISTANCE_METRIC", "L2" +) + +# Insert +vec = struct.pack('128f', *[0.1]*128) +r.hset("doc1", mapping={"embedding": vec}) + +# Search +qvec = struct.pack('128f', *[0.15]*128) +q = "*=>[KNN 3 @embedding 
$BLOB AS dist]" +res = r.ft("my_idx").search(q, query_params={"BLOB": qvec}) +print(res.docs) +``` + diff --git a/docs/utility_function/websearch.md b/docs/utility_function/websearch.md index d13a13d..2ca8a02 100644 --- a/docs/utility_function/websearch.md +++ b/docs/utility_function/websearch.md @@ -8,7 +8,7 @@ nav_order: 4 We recommend some implementations of commonly used web search tools. -| **API** | **Free Tier** | **Pricing Model** | **Official API Page** | +| **API** | **Free Tier** | **Pricing Model** | **Docs** | |---------------------------------|-----------------------------------------------|-----------------------------------------------------------------|------------------------------------------------------------------------| | **Google Custom Search JSON API** | 100 queries/day free | $5 per 1000 queries. | [Link](https://developers.google.com/custom-search/v1/overview) | | **Bing Web Search API** | 1,000 queries/month | $15–$25 per 1,000 queries. | [Link](https://azure.microsoft.com/en-us/services/cognitive-services/bing-web-search-api/) |