implement audio func
parent fef5bf10a8
commit 1d6e38c1a3
@@ -2,19 +2,7 @@ import os
 from openai import OpenAI
 
 def call_llm(prompt, history=None):
-    """
-    Calls the OpenAI API to get a response from an LLM.
-
-    Args:
-        prompt: The user's current prompt.
-        history: A list of previous messages in the conversation, where each message
-                 is a dict with "role" and "content" keys. E.g.,
-                 [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}]
-
-    Returns:
-        The LLM's response content as a string.
-    """
-    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key")) # Default if not set
+    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key"))
 
     messages = []
     if history:
@@ -28,14 +16,11 @@ def call_llm(prompt, history=None):
     return r.choices[0].message.content
 
 if __name__ == "__main__":
-    # Ensure you have OPENAI_API_KEY set in your environment for this test to work
     print("Testing LLM call...")
 
-    # Test with a simple prompt
     response = call_llm("Tell me a short joke")
     print(f"LLM (Simple Joke): {response}")
 
-    # Test with history
     chat_history = [
         {"role": "user", "content": "What is the capital of France?"},
         {"role": "assistant", "content": "The capital of France is Paris."}
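Note: the two hunks above elide the middle of call_llm (the message assembly and the API call). A minimal sketch of what that elided section plausibly looks like, inferred only from the surrounding context lines (`messages = []`, `if history:`, `return r.choices[0].message.content`); the model name is a placeholder, not taken from this commit:

    messages = []
    if history:
        messages.extend(history)  # prior turns as {"role": ..., "content": ...} dicts
    messages.append({"role": "user", "content": prompt})

    r = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder; the actual model name is not visible in this diff
        messages=messages,
    )
    return r.choices[0].message.content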
@@ -0,0 +1,45 @@
+import os
+from openai import OpenAI
+import io
+
+def speech_to_text_api(audio_data: bytes, sample_rate: int):
+    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+    # The API expects a file-like object. We can use io.BytesIO for in-memory bytes.
+    # We also need to give it a name, as if it were a file upload.
+    audio_file = io.BytesIO(audio_data)
+    audio_file.name = "audio.mp3"  # Provide a dummy filename with a common audio extension
+
+    transcript = client.audio.transcriptions.create(
+        model="gpt-4o-transcribe",
+        file=audio_file
+        # language="en"  # Optional: specify the language as an ISO-639-1 code
+        # prompt="PocketFlow, LLM"  # Optional: provide a prompt to guide the model
+    )
+    return transcript.text
+
+if __name__ == "__main__":
+    print("Testing Speech-to-Text API...")
+    # The OpenAI client will raise an error if the API key is not found or invalid.
+    # No explicit check here to keep it minimal.
+    test_audio_path = "tts_output.mp3"
+    if os.path.exists(test_audio_path):
+        print(f"Found {test_audio_path}, using it for STT test.")
+        with open(test_audio_path, "rb") as f:
+            audio_bytes_for_stt = f.read()
+
+        # Sample rate for tts_output.mp3 from our TTS script is 24000,
+        # but the transcription model should infer or handle common formats well.
+        stt_sample_rate = 24000
+
+        transcribed_text = speech_to_text_api(audio_bytes_for_stt, stt_sample_rate)
+
+        if transcribed_text:
+            print(f"Transcribed text: {transcribed_text}")
+        else:
+            print("Failed to transcribe audio (API returned empty data).")
+    else:
+        print(f"Test audio file '{test_audio_path}' not found.")
+        print("Please run the text_to_speech.py test first to generate it, or place your own audio file")
+        print(" (e.g., named 'test_audio.mp3') in the same directory as this script and modify the path.")
+        print("Make sure it's a common audio format like MP3, WAV, M4A, etc.")
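Note: speech_to_text_api accepts a sample_rate argument but never uses it; the endpoint infers the format from the uploaded file's name and container headers, which is why the dummy "audio.mp3" name matters. Raw PCM has no such headers, so it would first need to be wrapped in a container. A minimal sketch using only the standard library (the helper name is hypothetical, not part of this commit):

    import io
    import wave

    def pcm16_to_wav_bytes(pcm: bytes, sample_rate: int, channels: int = 1) -> bytes:
        """Wrap raw 16-bit PCM in a WAV container so the upload has format headers."""
        buf = io.BytesIO()
        with wave.open(buf, "wb") as w:
            w.setnchannels(channels)
            w.setsampwidth(2)  # 16-bit samples
            w.setframerate(sample_rate)
            w.writeframes(pcm)
        return buf.getvalue()

    # Usage sketch: wav_bytes = pcm16_to_wav_bytes(raw_pcm, 24000)
    # speech_to_text_api hardcodes audio_file.name = "audio.mp3"; for WAV bytes
    # the dummy name should be "audio.wav" instead.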
@@ -0,0 +1,32 @@
+import os
+from openai import OpenAI
+
+def text_to_speech_api(text_to_synthesize: str):
+    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+    response = client.audio.speech.create(
+        model="gpt-4o-mini-tts",
+        voice="alloy",  # Other voices: echo, fable, onyx, nova, shimmer
+        input=text_to_synthesize,
+        response_format="mp3"  # Other formats: opus, aac, flac. MP3 is widely supported.
+        # OpenAI TTS output defaults to a 24 kHz sample rate.
+    )
+    # response.content is already bytes (the audio data).
+    # Alternatively, for streaming straight to a file: response.stream_to_file("output.mp3")
+    audio_data_bytes = response.content
+    sample_rate = 24000  # OpenAI TTS models output 24 kHz audio
+    return audio_data_bytes, sample_rate
+
+if __name__ == "__main__":
+    print("Testing Text-to-Speech API...")
+    # The OpenAI client will raise an error if the API key is not found or invalid.
+    # No explicit check here to keep it minimal.
+    text = "Hello from PocketFlow! This is a test of the text-to-speech functionality."
+    audio_bytes, rate = text_to_speech_api(text)
+    if audio_bytes and rate:
+        print(f"Successfully converted text to speech. Audio data length: {len(audio_bytes)} bytes, Sample rate: {rate} Hz.")
+        with open('tts_output.mp3', 'wb') as f:
+            f.write(audio_bytes)
+        print("Saved TTS output to tts_output.mp3")
+    else:
+        print("Failed to convert text to speech (API returned empty data).")
Binary file not shown.
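Note: the two new helpers compose into a quick round-trip smoke test. A sketch, assuming the STT helper lives alongside text_to_speech.py in a module importable as speech_to_text (the new files' paths are not shown in this view):

    from text_to_speech import text_to_speech_api  # filename referenced by the STT test above
    from speech_to_text import speech_to_text_api  # assumed module name

    audio_bytes, rate = text_to_speech_api("The quick brown fox jumps over the lazy dog.")
    text_back = speech_to_text_api(audio_bytes, rate)
    print(text_back)  # should roughly match the input sentence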