pocketflow/cookbook/pocketflow-voice-chat/utils/text_to_speech.py

import os
from openai import OpenAI

def text_to_speech_api(text_to_synthesize: str):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice="alloy", # Other voices: echo, fable, onyx, nova, shimmer
        input=text_to_synthesize,
        response_format="mp3" # Other formats: opus, aac, flac. MP3 is widely supported.
                              # OpenAI default sample rate for tts-1 is 24kHz.
    )
    # The response.content is already bytes (the audio data)
    # Alternatively, for streaming and saving to file: response.stream_to_file("output.mp3")
    audio_data_bytes = response.content
    sample_rate = 24000 # OpenAI TTS model tts-1 outputs 24kHz
    return audio_data_bytes, sample_rate

if __name__ == "__main__":
    print("Testing Text-to-Speech API...")
    # The OpenAI client will raise an error if API key is not found or invalid.
    # No explicit check here to keep it minimal.
    text = "Hello from PocketFlow! This is a test of the text-to-speech functionality."
    audio_bytes, rate = text_to_speech_api(text)
    if audio_bytes and rate:
        print(f"Successfully converted text to speech. Audio data length: {len(audio_bytes)} bytes, Sample rate: {rate} Hz.")
        with open('tts_output.mp3', 'wb') as f:
            f.write(audio_bytes)
        print("Saved TTS output to tts_output.mp3")
    else:
        print("Failed to convert text to speech (API returned empty data).")