finish voice chat
parent c4898bf307
commit 76b5886dd0
@@ -76,12 +76,12 @@ From there, it's easy to implement popular design patterns like ([Multi-](https:
| [Batch](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-batch) | ☆☆☆ <br> *Dummy* | A batch processor that translates markdown content into multiple languages |
| [Streaming](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-llm-streaming) | ☆☆☆ <br> *Dummy* | A real-time LLM streaming demo with user interrupt capability |
| [Chat Guardrail](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-chat-guardrail) | ☆☆☆ <br> *Dummy* | A travel advisor chatbot that only processes travel-related queries |
| [Map-Reduce](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-map-reduce) | ☆☆☆ <br> *Dummy* | A resume qualification processor using map-reduce pattern for batch evaluation |
| [Multi-Agent](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-multi-agent) | ★☆☆ <br> *Beginner* | A Taboo word game for asynchronous communication between two agents |
| [Supervisor](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-supervisor) | ★☆☆ <br> *Beginner* | Research agent is getting unreliable... Let's build a supervision process |
| [Parallel](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-parallel-batch) | ★☆☆ <br> *Beginner* | A parallel execution demo that shows 3x speedup |
| [Parallel Flow](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-parallel-batch-flow) | ★☆☆ <br> *Beginner* | A parallel image processing demo showing 8x speedup with multiple filters |
| [Majority Vote](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-majority-vote) | ★☆☆ <br> *Beginner* | Improve reasoning accuracy by aggregating multiple solution attempts |
| [Thinking](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-thinking) | ★☆☆ <br> *Beginner* | Solve complex reasoning problems through Chain-of-Thought |
| [Memory](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-chat-memory) | ★☆☆ <br> *Beginner* | A chat bot with short-term and long-term memory |
| [Text2SQL](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-text2sql) | ★☆☆ <br> *Beginner* | Convert natural language to SQL queries with an auto-debug loop |

@@ -89,6 +89,7 @@ From there, it's easy to implement popular design patterns like ([Multi-](https:
| [A2A](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-a2a) | ★☆☆ <br> *Beginner* | Agent wrapped with Agent-to-Agent protocol for inter-agent communication |
| [Streamlit HITL](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-streamlit-hitl) | ★☆☆ <br> *Beginner* | Streamlit app for human-in-the-loop review |
| [FastAPI HITL](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-fastapi-hitl) | ★☆☆ <br> *Beginner* | FastAPI app for async human review loop with SSE |
| [Voice Chat](https://github.com/The-Pocket/PocketFlow/tree/main/cookbook/pocketflow-voice-chat) | ★☆☆ <br> *Beginner* | An interactive voice chat application with VAD, STT, LLM, and TTS |

</div>

@@ -1 +1,83 @@
# PocketFlow Voice Chat

This project demonstrates a voice-based interactive chat application built with PocketFlow. Users can speak their queries, and the system will respond with spoken answers from an LLM, maintaining conversation history.

## Features

- **Voice Activity Detection (VAD)**: Automatically detects when the user starts and stops speaking.
- **Speech-to-Text (STT)**: Converts spoken audio into text using OpenAI.
- **LLM Interaction**: Processes the transcribed text with an LLM (e.g., GPT-4o), maintaining conversation history.
- **Text-to-Speech (TTS)**: Converts the LLM's text response back into audible speech using OpenAI.
- **Continuous Conversation**: Loops back to listen for the next user query after responding, allowing for an ongoing dialogue.

## How to Run

1. **Set your OpenAI API key**:
   ```bash
   export OPENAI_API_KEY="your-api-key-here"
   ```
   Ensure this environment variable is set, as the utility scripts for STT, LLM, and TTS rely on it.
   You can test individual utility functions (e.g., `python utils/call_llm.py`, `python utils/text_to_speech.py`) to help verify your API key and setup.

2. **Install dependencies**:
   Make sure you have Python installed. Then, install the required libraries using pip:
   ```bash
   pip install -r requirements.txt
   ```
   This will install libraries such as `openai`, `pocketflow`, `sounddevice`, `numpy`, `scipy`, and `soundfile`.

   **Note for Linux users**: `sounddevice` may require PortAudio. If you encounter issues, you might need to install it first:
   ```bash
   sudo apt-get update && sudo apt-get install -y portaudio19-dev
   ```

3. **Run the application**:
   ```bash
   python main.py
   ```
   Follow the console prompts. The application will start listening when you see "Listening for your query...".

## How It Works

The application uses a PocketFlow workflow to manage the conversation steps:

```mermaid
flowchart TD
    CaptureAudio[Capture Audio] --> SpeechToText[Speech to Text]
    SpeechToText --> QueryLLM[Query LLM]
    QueryLLM --> TextToSpeech[Text to Speech & Play]
    TextToSpeech -- "Next Turn" --> CaptureAudio
```

Here's what each node in the flow does:

1. **`CaptureAudioNode`**: Records audio from the user's microphone. It uses Voice Activity Detection (VAD) to start recording when speech is detected and stop when silence is detected.
2. **`SpeechToTextNode`**: Takes the recorded audio data, converts it to a suitable format, and sends it to OpenAI's STT API (`gpt-4o-transcribe`) to get the transcribed text.
3. **`QueryLLMNode`**: Sends the transcribed text, along with the existing conversation history, to an LLM (OpenAI's GPT-4o) to generate a response.
4. **`TextToSpeechNode`**: Receives the text response from the LLM, converts it into audio using OpenAI's TTS API (`gpt-4o-mini-tts`), and plays the audio back to the user. If the conversation is set to continue, it transitions back to the `CaptureAudioNode`.

## Example Interaction

When you run `main.py`:

1. The console will display:
   ```
   Starting PocketFlow Voice Chat...
   Speak your query after 'Listening for your query...' appears.
   ...
   ```
2. When you see `Listening for your query...`, speak clearly into your microphone.
3. After you stop speaking, the console will show updates:
   ```
   Audio captured (X.XXs), proceeding to STT.
   Converting speech to text...
   User: [Your transcribed query will appear here]
   Sending query to LLM...
   LLM: [The LLM's response text will appear here]
   Converting LLM response to speech...
   Playing LLM response...
   ```
4. You will hear the LLM's response spoken aloud.
5. The application will then loop back, and you'll see `Listening for your query...` again, ready for your next input.

The conversation continues in this manner. To stop the application, you typically need to interrupt it (e.g., Ctrl+C in the terminal), as it's designed to loop continuously.

@@ -53,28 +53,31 @@ flowchart TD

> 2. Include only the necessary utility functions, based on nodes in the flow.

1. **`record_audio()`** (`utils/audio_utils.py`)
   - *Input*: (Optional) `sample_rate` (int, Hz, e.g., `DEFAULT_SAMPLE_RATE`), `channels` (int, e.g., `DEFAULT_CHANNELS`), `chunk_size_ms` (int, e.g., `DEFAULT_CHUNK_SIZE_MS`), `silence_threshold_rms` (float, e.g., `DEFAULT_SILENCE_THRESHOLD_RMS`), `min_silence_duration_ms` (int, e.g., `DEFAULT_MIN_SILENCE_DURATION_MS`), `max_recording_duration_s` (int, e.g., `DEFAULT_MAX_RECORDING_DURATION_S`), `pre_roll_chunks_count` (int, e.g., `DEFAULT_PRE_ROLL_CHUNKS`).
   - *Output*: A tuple `(audio_data, sample_rate)` where `audio_data` is a NumPy array of float32 audio samples and `sample_rate` is the recording sample rate (int). Returns `(None, sample_rate)` if no speech is detected or recording fails.
   - *Description*: Records audio from the microphone using silence-based Voice Activity Detection (VAD). Buffers `pre_roll_chunks_count` chunks of audio and starts full recording when sound is detected above `silence_threshold_rms`. Stops after `min_silence_duration_ms` of sound below the threshold or when `max_recording_duration_s` is reached.
   - *Necessity*: Used by `CaptureAudioNode` to get the user's voice input.
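
   `utils/audio_utils.py` itself is not shown in this commit, so the following is only a minimal sketch of the VAD recorder described above, assuming `sounddevice` for capture; the parameter defaults are illustrative stand-ins for the `DEFAULT_*` constants, not the project's actual values:

   ```python
   import collections

   import numpy as np
   import sounddevice as sd

   def record_audio(sample_rate=16000, channels=1, chunk_size_ms=50,
                    silence_threshold_rms=0.01, min_silence_duration_ms=800,
                    max_recording_duration_s=30, pre_roll_chunks_count=5):
       chunk_frames = int(sample_rate * chunk_size_ms / 1000)
       max_chunks = int(max_recording_duration_s * 1000 / chunk_size_ms)
       silence_chunks_to_stop = max(1, min_silence_duration_ms // chunk_size_ms)

       pre_roll = collections.deque(maxlen=pre_roll_chunks_count)
       recorded, silent_chunks, speech_started = [], 0, False

       with sd.InputStream(samplerate=sample_rate, channels=channels, dtype="float32") as stream:
           for _ in range(max_chunks):
               chunk, _ = stream.read(chunk_frames)
               rms = float(np.sqrt(np.mean(chunk ** 2)))
               if not speech_started:
                   pre_roll.append(chunk)
                   if rms > silence_threshold_rms:
                       speech_started = True
                       recorded.extend(pre_roll)  # include audio just before speech onset
               else:
                   recorded.append(chunk)
                   silent_chunks = silent_chunks + 1 if rms < silence_threshold_rms else 0
                   if silent_chunks >= silence_chunks_to_stop:
                       break  # enough trailing silence: the user has finished speaking

       if not recorded:
           return None, sample_rate  # no speech detected
       return np.concatenate(recorded).flatten(), sample_rate
   ```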
2. **`speech_to_text_api(audio_data, sample_rate)`** (`utils/speech_to_text.py`)
   - *Input*: `audio_data` (bytes), `sample_rate` (int, though the API might infer this from the audio format).
   - *Output*: `transcribed_text` (str).
   - *Necessity*: Used by `SpeechToTextNode` to convert in-memory audio data to text.
   - *Example Model*: OpenAI `gpt-4o-transcribe`.
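
   The diff to `utils/speech_to_text.py` at the end of this commit shows only a fragment, so here is a hedged sketch of the full function, assuming the OpenAI Python SDK:

   ```python
   import io

   from openai import OpenAI

   client = OpenAI()  # reads OPENAI_API_KEY from the environment

   def speech_to_text_api(audio_data: bytes, sample_rate: int) -> str:
       # The API expects a named file-like object, as if it were a file upload.
       # sample_rate travels in the WAV header, so it is not passed separately.
       audio_file = io.BytesIO(audio_data)
       audio_file.name = "audio.wav"

       transcript = client.audio.transcriptions.create(
           model="gpt-4o-transcribe",
           file=audio_file,
       )
       return transcript.text
   ```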
3. **`call_llm(messages)`** (`utils/call_llm.py`)
   - *Input*: `messages` (list of dicts, e.g., `[{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]`). This should be the complete conversation history, including the latest user query.
   - *Output*: `llm_response_text` (str)
   - *Necessity*: Used by `QueryLLMNode` to get an intelligent response.
   - *Example Model*: OpenAI `gpt-4o`.
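
   A minimal sketch of `utils/call_llm.py` consistent with this signature, assuming the OpenAI Python SDK (the `__main__` block matches the README's suggestion to test utilities directly):

   ```python
   from openai import OpenAI

   client = OpenAI()  # reads OPENAI_API_KEY from the environment

   def call_llm(messages: list) -> str:
       # messages is the full conversation history, latest user query last
       response = client.chat.completions.create(
           model="gpt-4o",
           messages=messages,
       )
       return response.choices[0].message.content

   if __name__ == "__main__":
       print(call_llm([{"role": "user", "content": "Say hello in one short sentence."}]))
   ```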
4. **`text_to_speech_api(text_to_synthesize)`** (`utils/text_to_speech.py`)
   - *Input*: `text_to_synthesize` (str).
   - *Output*: A tuple `(audio_data, sample_rate)` where `audio_data` is in-memory audio as bytes (e.g., MP3 format from OpenAI) and `sample_rate` is the audio sample rate (int, e.g., 24000 Hz for OpenAI `gpt-4o-mini-tts`).
   - *Necessity*: Used by `TextToSpeechNode` to convert LLM text to speakable in-memory audio data.
   - *Example Model*: OpenAI `gpt-4o-mini-tts`.
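
   A hedged sketch of `utils/text_to_speech.py`, assuming the OpenAI Python SDK; the `alloy` voice is an illustrative choice, and 24000 Hz is the sample rate noted above for this model:

   ```python
   from openai import OpenAI

   client = OpenAI()  # reads OPENAI_API_KEY from the environment

   def text_to_speech_api(text_to_synthesize: str):
       response = client.audio.speech.create(
           model="gpt-4o-mini-tts",
           voice="alloy",  # assumed voice choice
           input=text_to_synthesize,
       )
       # response.content holds compressed audio bytes (MP3 by default)
       return response.content, 24000
   ```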
5. **`play_audio_data(audio_data, sample_rate)`** (`utils/audio_utils.py`)
   - *Input*: `audio_data` (NumPy array of float32 audio samples), `sample_rate` (int).
   - *Output*: None
   - *Necessity*: Used by `TextToSpeechNode` (in its `post` method) to play the in-memory synthesized speech.
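
   A minimal sketch of `play_audio_data`, assuming `sounddevice` (already in the requirements) for output:

   ```python
   import numpy as np
   import sounddevice as sd

   def play_audio_data(audio_data: np.ndarray, sample_rate: int) -> None:
       sd.play(audio_data, samplerate=sample_rate)
       sd.wait()  # block until playback finishes
   ```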
@@ -88,11 +91,8 @@ The shared memory structure is organized as follows:

```python
shared = {
    "user_audio_data": None,          # In-memory audio data (NumPy array) from user
    "user_audio_sample_rate": None,   # int: Sample rate of the user audio
    "chat_history": [],               # list: Conversation history [{"role": "user/assistant", "content": "..."}]
    "continue_conversation": True     # boolean: Flag to control the main conversation loop
}
```

@@ -107,40 +107,41 @@ shared = {

   - *Type*: Regular
   - *Steps*:
     - *prep*: Check `shared["continue_conversation"]`. (Potentially load VAD parameters from `shared["config"]` if dynamic.)
     - *exec*: Call `utils.audio_utils.record_audio()` (passing VAD parameters if configured). This returns a NumPy array and sample rate.
     - *post*: `audio_numpy_array, sample_rate = exec_res`. Write `audio_numpy_array` to `shared["user_audio_data"]` and `sample_rate` to `shared["user_audio_sample_rate"]`. Returns `"default"`.

2. **`SpeechToTextNode`**
   - *Purpose*: Convert the recorded in-memory audio to text.
   - *Type*: Regular
   - *Steps*:
     - *prep*: Read `shared["user_audio_data"]` (NumPy array) and `shared["user_audio_sample_rate"]`. Return `(user_audio_data_numpy, user_audio_sample_rate)`.
     - *exec*: `audio_numpy_array, sample_rate = prep_res`. **Convert `audio_numpy_array` to audio `bytes` (e.g., in WAV format using `scipy.io.wavfile.write` to an `io.BytesIO` object).** Call `utils.speech_to_text.speech_to_text_api(audio_bytes, sample_rate)`.
     - *post*:
       - Let `transcribed_text = exec_res`.
       - Append `{"role": "user", "content": transcribed_text}` to `shared["chat_history"]`.
       - Clear `shared["user_audio_data"]` and `shared["user_audio_sample_rate"]`, as they are no longer needed.
       - Returns `"default"` (assuming STT is successful, as per the simplification above).

3. **`QueryLLMNode`**
   - *Purpose*: Get a response from the LLM based on the user's query and conversation history.
   - *Type*: Regular
   - *Steps*:
     - *prep*: Read `shared["chat_history"]`. Return `chat_history`.
     - *exec*: `history = prep_res`. Call `utils.call_llm.call_llm(messages=history)`.
     - *post*:
       - Let `llm_response = exec_res`.
       - Append `{"role": "assistant", "content": llm_response}` to `shared["chat_history"]`.
       - Returns `"default"` (assuming the LLM call is successful).

4. **`TextToSpeechNode`**
   - *Purpose*: Convert the LLM's text response into speech and play it.
   - *Type*: Regular
   - *Steps*:
     - *prep*: Read `shared["chat_history"]`. Identify the last message, which should be the LLM's response. Return its content.
     - *exec*: `text_to_synthesize = prep_res`. Call `utils.text_to_speech.text_to_speech_api(text_to_synthesize)`. This returns `(llm_audio_bytes, llm_sample_rate)`.
     - *post*: `llm_audio_bytes, llm_sample_rate = exec_res`.
       - **Convert `llm_audio_bytes` (e.g., MP3 bytes from the TTS API) to a NumPy array of audio samples (e.g., using a library like `pydub` or `soundfile` to decode).**
       - Call `utils.audio_utils.play_audio_data(llm_audio_numpy_array, llm_sample_rate)`.
       - (Optional) Log completion.
       - If `shared["continue_conversation"]` is `True`, return `"next_turn"` to loop back.
       - Otherwise, return `"end_conversation"`.

@@ -0,0 +1,25 @@

from pocketflow import Flow
from nodes import CaptureAudioNode, SpeechToTextNode, QueryLLMNode, TextToSpeechNode


def create_voice_chat_flow() -> Flow:
    """Creates and returns the voice chat flow."""
    # Create nodes
    capture_audio = CaptureAudioNode()
    speech_to_text = SpeechToTextNode()
    query_llm = QueryLLMNode()
    text_to_speech = TextToSpeechNode()

    # Define transitions
    capture_audio >> speech_to_text
    speech_to_text >> query_llm
    query_llm >> text_to_speech

    # Loop back for next turn or end
    text_to_speech - "next_turn" >> capture_audio
    # An "end_conversation" action from any node will terminate the flow naturally,
    # since no transition is defined for it from the current node.
    # Alternatively, one could explicitly transition to an EndNode if desired.

    # Create flow starting with the capture audio node
    voice_chat_flow = Flow(start=capture_audio)
    return voice_chat_flow

@@ -0,0 +1,28 @@

from flow import create_voice_chat_flow


def main():
    """Runs the PocketFlow Voice Chat application."""
    print("Starting PocketFlow Voice Chat...")
    print("Speak your query after 'Listening for your query...' appears.")
    print("The conversation loops until an error occurs or it is stopped.")
    print("Press Ctrl+C to stop.")

    shared = {
        "user_audio_data": None,
        "user_audio_sample_rate": None,
        "user_text_query": None,
        "llm_text_response": None,
        "chat_history": [],
        "continue_conversation": True  # Flag to control the main conversation loop
    }

    # Create the flow
    voice_chat_flow = create_voice_chat_flow()

    # Run the flow
    # The flow loops on the "next_turn" action from TextToSpeechNode and the
    # continue_conversation flag checked within nodes; it ends when a node
    # returns an action with no outgoing transition (e.g., "end_conversation").
    voice_chat_flow.run(shared)


if __name__ == "__main__":
    main()

@@ -0,0 +1,148 @@

import io

import numpy as np
import scipy.io.wavfile
import soundfile  # For decoding TTS audio bytes (e.g., MP3) to a NumPy array

from pocketflow import Node
from utils.audio_utils import record_audio, play_audio_data
from utils.speech_to_text import speech_to_text_api
from utils.call_llm import call_llm
from utils.text_to_speech import text_to_speech_api


class CaptureAudioNode(Node):
    """Records audio input from the user using VAD."""

    def exec(self, _):  # prep_res is not used, as per the design
        print("\nListening for your query...")
        audio_data, sample_rate = record_audio()
        if audio_data is None:
            return None, None
        return audio_data, sample_rate

    def post(self, shared, prep_res, exec_res):
        audio_numpy_array, sample_rate = exec_res
        if audio_numpy_array is None:
            shared["user_audio_data"] = None
            shared["user_audio_sample_rate"] = None
            print("CaptureAudioNode: Failed to capture audio.")
            return "end_conversation"

        shared["user_audio_data"] = audio_numpy_array
        shared["user_audio_sample_rate"] = sample_rate
        print(f"Audio captured ({len(audio_numpy_array)/sample_rate:.2f}s), proceeding to STT.")
        return "default"


class SpeechToTextNode(Node):
    """Converts the recorded in-memory audio to text."""

    def prep(self, shared):
        user_audio_data = shared.get("user_audio_data")
        user_audio_sample_rate = shared.get("user_audio_sample_rate")
        if user_audio_data is None or user_audio_sample_rate is None:
            print("SpeechToTextNode: No audio data to process.")
            return None  # Signal to skip exec
        return user_audio_data, user_audio_sample_rate

    def exec(self, prep_res):
        if prep_res is None:
            return None  # Skip if no audio data

        audio_numpy_array, sample_rate = prep_res

        # Convert NumPy array to WAV bytes for the API
        byte_io = io.BytesIO()
        scipy.io.wavfile.write(byte_io, sample_rate, audio_numpy_array)
        wav_bytes = byte_io.getvalue()

        print("Converting speech to text...")
        transcribed_text = speech_to_text_api(audio_data=wav_bytes, sample_rate=sample_rate)
        return transcribed_text

    def post(self, shared, prep_res, exec_res):
        if exec_res is None:
            print("SpeechToTextNode: STT API returned no text.")
            return "end_conversation"

        transcribed_text = exec_res
        print(f"User: {transcribed_text}")

        if "chat_history" not in shared:
            shared["chat_history"] = []
        shared["chat_history"].append({"role": "user", "content": transcribed_text})

        shared["user_audio_data"] = None
        shared["user_audio_sample_rate"] = None
        return "default"


class QueryLLMNode(Node):
    """Gets a response from the LLM."""

    def prep(self, shared):
        chat_history = shared.get("chat_history", [])
        if not chat_history:
            print("QueryLLMNode: Chat history is empty. Skipping LLM call.")
            return None
        return chat_history

    def exec(self, prep_res):
        if prep_res is None:
            return None

        chat_history = prep_res
        print("Sending query to LLM...")
        llm_response_text = call_llm(messages=chat_history)
        return llm_response_text

    def post(self, shared, prep_res, exec_res):
        if exec_res is None:
            print("QueryLLMNode: LLM API returned no response.")
            return "end_conversation"

        llm_response_text = exec_res
        print(f"LLM: {llm_response_text}")

        shared["chat_history"].append({"role": "assistant", "content": llm_response_text})
        return "default"


class TextToSpeechNode(Node):
    """Converts the LLM's text response into speech and plays it."""

    def prep(self, shared):
        chat_history = shared.get("chat_history", [])
        if not chat_history:
            print("TextToSpeechNode: Chat history is empty. No LLM response to synthesize.")
            return None

        last_message = chat_history[-1]
        if last_message.get("role") == "assistant" and last_message.get("content"):
            return last_message.get("content")
        else:
            print("TextToSpeechNode: Last message not from assistant or no content. Skipping TTS.")
            return None

    def exec(self, prep_res):
        if prep_res is None:
            return None, None

        llm_text_response = prep_res
        print("Converting LLM response to speech...")
        llm_audio_bytes, llm_sample_rate = text_to_speech_api(llm_text_response)
        return llm_audio_bytes, llm_sample_rate

    def post(self, shared, prep_res, exec_res):
        if exec_res is None or exec_res[0] is None:
            print("TextToSpeechNode: TTS failed or was skipped.")
            return "next_turn"

        llm_audio_bytes, llm_sample_rate = exec_res

        print("Playing LLM response...")
        try:
            # Decode the compressed audio bytes into samples before playback
            audio_segment, sr_from_file = soundfile.read(io.BytesIO(llm_audio_bytes))
            play_audio_data(audio_segment, sr_from_file)
        except Exception as e:
            print(f"Error playing TTS audio: {e}")
            return "next_turn"

        if shared.get("continue_conversation", True):
            return "next_turn"
        else:
            print("Conversation ended by user flag.")
            return "end_conversation"

@@ -1,5 +1,6 @@

openai
pocketflow
numpy
sounddevice
scipy
soundfile

@@ -8,7 +8,7 @@ def speech_to_text_api(audio_data: bytes, sample_rate: int):

    # The API expects a file-like object. We can use io.BytesIO for in-memory bytes.
    # We also need to give it a name, as if it were a file upload.
    audio_file = io.BytesIO(audio_data)
    audio_file.name = "audio.wav"  # Use a .wav name to match the WAV bytes sent by SpeechToTextNode

    transcript = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",