diff --git a/cookbook/pocketflow-voice-chat/utils/call_llm.py b/cookbook/pocketflow-voice-chat/utils/call_llm.py index 5f7dc09..830185e 100644 --- a/cookbook/pocketflow-voice-chat/utils/call_llm.py +++ b/cookbook/pocketflow-voice-chat/utils/call_llm.py @@ -2,19 +2,7 @@ import os from openai import OpenAI def call_llm(prompt, history=None): - """ - Calls the OpenAI API to get a response from an LLM. - - Args: - prompt: The user's current prompt. - history: A list of previous messages in the conversation, where each message - is a dict with "role" and "content" keys. E.g., - [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}] - - Returns: - The LLM's response content as a string. - """ - client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key")) # Default if not set + client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key")) messages = [] if history: @@ -28,14 +16,11 @@ def call_llm(prompt, history=None): return r.choices[0].message.content if __name__ == "__main__": - # Ensure you have OPENAI_API_KEY set in your environment for this test to work print("Testing LLM call...") - # Test with a simple prompt response = call_llm("Tell me a short joke") print(f"LLM (Simple Joke): {response}") - # Test with history chat_history = [ {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."} diff --git a/cookbook/pocketflow-voice-chat/utils/speech_to_text.py b/cookbook/pocketflow-voice-chat/utils/speech_to_text.py new file mode 100644 index 0000000..35d533d --- /dev/null +++ b/cookbook/pocketflow-voice-chat/utils/speech_to_text.py @@ -0,0 +1,45 @@ +import os +from openai import OpenAI +import io + +def speech_to_text_api(audio_data: bytes, sample_rate: int): + client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + # The API expects a file-like object. We can use io.BytesIO for in-memory bytes. 
+ # We also need to give it a name, as if it were a file upload. + audio_file = io.BytesIO(audio_data) + audio_file.name = "audio.mp3" # Provide a dummy filename with a common audio extension + + transcript = client.audio.transcriptions.create( + model="gpt-4o-transcribe", + file=audio_file + # language="en" # Optional: specify language ISO-639-1 code + # prompt="PocketFlow, LLM" # Optional: provide a prompt to guide the model + ) + return transcript.text + +if __name__ == "__main__": + print("Testing Speech-to-Text API...") + # The OpenAI client will raise an error if API key is not found or invalid. + # No explicit check here to keep it minimal. + test_audio_path = "tts_output.mp3" + if os.path.exists(test_audio_path): + print(f"Found {test_audio_path}, using it for STT test.") + with open(test_audio_path, "rb") as f: + audio_bytes_for_stt = f.read() + + # Sample rate for tts_output.mp3 from our TTS script is 24000 + # but Whisper should ideally infer or handle common formats well. + stt_sample_rate = 24000 + + transcribed_text = speech_to_text_api(audio_bytes_for_stt, stt_sample_rate) + + if transcribed_text: + print(f"Transcribed text: {transcribed_text}") + else: + print("Failed to transcribe audio (API returned empty data).") + else: + print(f"Test audio file '{test_audio_path}' not found.") + print("Please run the text_to_speech.py test first to generate it, or place your own audio file") + print(" (e.g., named 'test_audio.mp3') in the same directory as this script and modify the path.") + print("Make sure it's a common audio format like MP3, WAV, M4A etc.") \ No newline at end of file diff --git a/cookbook/pocketflow-voice-chat/utils/text_to_speech.py b/cookbook/pocketflow-voice-chat/utils/text_to_speech.py new file mode 100644 index 0000000..8a90f58 --- /dev/null +++ b/cookbook/pocketflow-voice-chat/utils/text_to_speech.py @@ -0,0 +1,32 @@ +import os +from openai import OpenAI + +def text_to_speech_api(text_to_synthesize: str): + client = 
OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + response = client.audio.speech.create( + model="gpt-4o-mini-tts", + voice="alloy", # Other voices: echo, fable, onyx, nova, shimmer + input=text_to_synthesize, + response_format="mp3" # Other formats: opus, aac, flac. MP3 is widely supported. + # OpenAI TTS output defaults to 24kHz (assumed to hold for gpt-4o-mini-tts — confirm). + ) + # The response.content is already bytes (the audio data) + # Alternatively, for streaming and saving to file: response.stream_to_file("output.mp3") + audio_data_bytes = response.content + sample_rate = 24000 # OpenAI TTS models output 24kHz audio by default + return audio_data_bytes, sample_rate + +if __name__ == "__main__": + print("Testing Text-to-Speech API...") + # The OpenAI client will raise an error if API key is not found or invalid. + # No explicit check here to keep it minimal. + text = "Hello from PocketFlow! This is a test of the text-to-speech functionality." + audio_bytes, rate = text_to_speech_api(text) + if audio_bytes and rate: + print(f"Successfully converted text to speech. Audio data length: {len(audio_bytes)} bytes, Sample rate: {rate} Hz.") + with open('tts_output.mp3', 'wb') as f: + f.write(audio_bytes) + print("Saved TTS output to tts_output.mp3") + else: + print("Failed to convert text to speech (API returned empty data).") \ No newline at end of file diff --git a/cookbook/pocketflow-voice-chat/utils/tts_output.mp3 b/cookbook/pocketflow-voice-chat/utils/tts_output.mp3 new file mode 100644 index 0000000..7239809 Binary files /dev/null and b/cookbook/pocketflow-voice-chat/utils/tts_output.mp3 differ