implement audio func

zachary62 2025-05-13 15:47:00 -04:00
parent fef5bf10a8
commit 1d6e38c1a3
4 changed files with 78 additions and 16 deletions

View File

@@ -2,19 +2,7 @@ import os
from openai import OpenAI
def call_llm(prompt, history=None):
"""
Calls the OpenAI API to get a response from an LLM.
Args:
prompt: The user's current prompt.
history: A list of previous messages in the conversation, where each message
is a dict with "role" and "content" keys. E.g.,
[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]
Returns:
The LLM's response content as a string.
"""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key")) # Default if not set
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key"))
messages = []
if history:
@@ -28,14 +16,11 @@ def call_llm(prompt, history=None):
    return r.choices[0].message.content
if __name__ == "__main__":
    # Ensure you have OPENAI_API_KEY set in your environment for this test to work
    print("Testing LLM call...")
    # Test with a simple prompt
    response = call_llm("Tell me a short joke")
    print(f"LLM (Simple Joke): {response}")
    # Test with history
    chat_history = [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."}

View File

@@ -0,0 +1,45 @@
import os
from openai import OpenAI
import io
def speech_to_text_api(audio_data: bytes, sample_rate: int):
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# The API expects a file-like object. We can use io.BytesIO for in-memory bytes.
# We also need to give it a name, as if it were a file upload.
audio_file = io.BytesIO(audio_data)
audio_file.name = "audio.mp3" # Provide a dummy filename with a common audio extension
transcript = client.audio.transcriptions.create(
model="gpt-4o-transcribe",
file=audio_file
# language="en" # Optional: specify language ISO-639-1 code
# prompt="PocketFlow, LLM" # Optional: provide a prompt to guide the model
)
return transcript.text
if __name__ == "__main__":
print("Testing Speech-to-Text API...")
# The OpenAI client will raise an error if API key is not found or invalid.
# No explicit check here to keep it minimal.
test_audio_path = "tts_output.mp3"
if os.path.exists(test_audio_path):
print(f"Found {test_audio_path}, using it for STT test.")
with open(test_audio_path, "rb") as f:
audio_bytes_for_stt = f.read()
# Sample rate for tts_output.mp3 from our TTS script is 24000
# but Whisper should ideally infer or handle common formats well.
stt_sample_rate = 24000
transcribed_text = speech_to_text_api(audio_bytes_for_stt, stt_sample_rate)
if transcribed_text:
print(f"Transcribed text: {transcribed_text}")
else:
print("Failed to transcribe audio (API returned empty data).")
else:
print(f"Test audio file '{test_audio_path}' not found.")
print("Please run the text_to_speech.py test first to generate it, or place your own audio file")
print(" (e.g., named 'test_audio.mp3') in the same directory as this script and modify the path.")
print("Make sure it's a common audio format like MP3, WAV, M4A etc.")

View File

@@ -0,0 +1,32 @@
import os
from openai import OpenAI
def text_to_speech_api(text_to_synthesize: str):
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
response = client.audio.speech.create(
model="gpt-4o-mini-tts",
voice="alloy", # Other voices: echo, fable, onyx, nova, shimmer
input=text_to_synthesize,
response_format="mp3" # Other formats: opus, aac, flac. MP3 is widely supported.
# OpenAI default sample rate for tts-1 is 24kHz.
)
# The response.content is already bytes (the audio data)
# Alternatively, for streaming and saving to file: response.stream_to_file("output.mp3")
audio_data_bytes = response.content
sample_rate = 24000 # OpenAI TTS model tts-1 outputs 24kHz
return audio_data_bytes, sample_rate
if __name__ == "__main__":
print("Testing Text-to-Speech API...")
# The OpenAI client will raise an error if API key is not found or invalid.
# No explicit check here to keep it minimal.
text = "Hello from PocketFlow! This is a test of the text-to-speech functionality."
audio_bytes, rate = text_to_speech_api(text)
if audio_bytes and rate:
print(f"Successfully converted text to speech. Audio data length: {len(audio_bytes)} bytes, Sample rate: {rate} Hz.")
with open('tts_output.mp3', 'wb') as f:
f.write(audio_bytes)
print("Saved TTS output to tts_output.mp3")
else:
print("Failed to convert text to speech (API returned empty data).")

Binary file not shown.