114 lines
4.8 KiB
Python
114 lines
4.8 KiB
Python
from collections import deque

import numpy as np
import sounddevice as sd
|
|
|
|
# Capture format passed to the input stream: sample rate in Hz and channel count.
DEFAULT_SAMPLE_RATE = 44_100
DEFAULT_CHANNELS = 1

# Audio is examined in chunks of this many milliseconds for the VAD check.
DEFAULT_CHUNK_SIZE_MS = 50

# RMS level that separates silence from sound; needs tuning.
DEFAULT_SILENCE_THRESHOLD_RMS = 0.01

# Recording stops after this much continuous silence (1 second).
DEFAULT_MIN_SILENCE_DURATION_MS = 1_000

# Safety cap, in seconds, on how long a single recording may run.
DEFAULT_MAX_RECORDING_DURATION_S = 15

# Number of chunks to keep before speech starts.
DEFAULT_PRE_ROLL_CHUNKS = 3
|
|
|
|
def record_audio(sample_rate=DEFAULT_SAMPLE_RATE,
                 channels=DEFAULT_CHANNELS,
                 chunk_size_ms=DEFAULT_CHUNK_SIZE_MS,
                 silence_threshold_rms=DEFAULT_SILENCE_THRESHOLD_RMS,
                 min_silence_duration_ms=DEFAULT_MIN_SILENCE_DURATION_MS,
                 max_recording_duration_s=DEFAULT_MAX_RECORDING_DURATION_S,
                 pre_roll_chunks_count=DEFAULT_PRE_ROLL_CHUNKS):
    """Record audio from the microphone with silence-based VAD.

    Audio is read in chunks of ``chunk_size_ms``. Capture starts with the
    first chunk whose RMS exceeds ``silence_threshold_rms`` and stops once
    ``min_silence_duration_ms`` worth of consecutive chunks fall below that
    threshold, or once ``max_recording_duration_s`` of audio has been read.

    Args:
        sample_rate: Sample rate passed to the input stream.
        channels: Number of input channels passed to the input stream.
        chunk_size_ms: Length of each analysed chunk, in milliseconds.
        silence_threshold_rms: RMS level separating silence from sound.
        min_silence_duration_ms: Continuous silence required to stop.
        max_recording_duration_s: Upper bound on total audio read.
        pre_roll_chunks_count: Number of most recent chunks retained while
            waiting for speech and prepended to the recording. The count
            includes the chunk that triggered detection; values below 1 are
            treated as 1 so that chunk is never dropped.

    Returns:
        ``(audio_data, sample_rate)`` where ``audio_data`` is the
        concatenation of the recorded float32 chunks, or ``None`` if no
        speech was detected or the input stream failed.

    Raises:
        ValueError: If ``sample_rate`` and ``chunk_size_ms`` yield a chunk
            shorter than one frame.
    """
    chunk_size_frames = int(sample_rate * chunk_size_ms / 1000)
    # Validate before dividing by chunk_size_ms: a zero/negative chunk would
    # otherwise surface as ZeroDivisionError or as a stream that never
    # detects speech.
    if chunk_size_frames < 1:
        raise ValueError(
            "sample_rate and chunk_size_ms must yield at least one frame per chunk"
        )
    min_silence_chunks = int(min_silence_duration_ms / chunk_size_ms)
    max_chunks = int(max_recording_duration_s * 1000 / chunk_size_ms)

    print(f"Listening... (max {max_recording_duration_s}s). Speak when ready.")
    print(f"(Silence threshold RMS: {silence_threshold_rms}, Min silence duration: {min_silence_duration_ms}ms)")

    recorded_frames = []
    # Bounded buffer of the latest chunks seen before speech. The floor of 1
    # guarantees the chunk that triggers detection is part of the recording.
    pre_roll_frames = deque(maxlen=max(1, pre_roll_chunks_count))
    is_recording = False
    silence_counter = 0

    try:
        with sd.InputStream(samplerate=sample_rate, channels=channels, dtype='float32') as stream:
            for _ in range(max_chunks):
                audio_chunk, overflowed = stream.read(chunk_size_frames)
                if overflowed:
                    print("Warning: Audio buffer overflowed!")

                rms = np.sqrt(np.mean(audio_chunk**2))

                if is_recording:
                    recorded_frames.append(audio_chunk)
                    if rms < silence_threshold_rms:
                        silence_counter += 1
                        if silence_counter >= min_silence_chunks:
                            print("Silence detected, stopping recording.")
                            break
                    else:
                        silence_counter = 0  # Reset silence counter on sound
                else:
                    pre_roll_frames.append(audio_chunk)
                    if rms > silence_threshold_rms:
                        print("Speech detected, starting recording.")
                        is_recording = True
                        recorded_frames.extend(pre_roll_frames)
                        pre_roll_frames.clear()
    except sd.PortAudioError as e:
        # Honour the documented contract: a failed recording yields None
        # rather than an exception, mirroring play_audio_data's reporting.
        print(f"Error recording audio: {e}")
        return None, sample_rate

    # Frames are only ever stored after speech was detected, so this single
    # check covers every "nothing captured" outcome.
    if not is_recording:
        print("No speech detected within the maximum recording duration.")
        return None, sample_rate

    audio_data = np.concatenate(recorded_frames)
    print(f"Recording finished. Total duration: {len(audio_data)/sample_rate:.2f}s")
    return audio_data, sample_rate
|
|
|
|
def play_audio_data(audio_data, sample_rate):
    """Play an in-memory audio buffer (NumPy array) and wait for it to finish.

    Any exception raised while announcing or playing the clip is caught and
    reported on stdout instead of propagating, so the function always
    returns ``None``.
    """
    try:
        # The announcement stays inside the try block so that unusable input
        # (e.g. no data, zero sample rate) is reported rather than raised.
        print(f"Playing in-memory audio data (Sample rate: {sample_rate} Hz, Duration: {len(audio_data)/sample_rate:.2f}s)")
        sd.play(audio_data, samplerate=sample_rate)
        sd.wait()
        print("Playback from memory finished.")
    except Exception as playback_error:
        print(f"Error playing in-memory audio: {playback_error}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: records one clip from the microphone and plays it
    # back from memory.
    print("--- Testing audio_utils.py ---")

    # Test 1: record_audio() and play_audio_data() (in-memory)
    print("\n--- Test: Record and Play In-Memory Audio ---")
    print("Please speak into the microphone. Recording will start on sound and stop on silence.")

    # Overrides for the demo run; everything else uses the module defaults.
    demo_settings = {
        "sample_rate": DEFAULT_SAMPLE_RATE,
        "silence_threshold_rms": 0.02,
        "min_silence_duration_ms": 1500,
        "max_recording_duration_s": 10,
    }
    recorded_audio, rec_sr = record_audio(**demo_settings)

    if recorded_audio is None or rec_sr is None:
        print("No audio recorded or recording failed.")
    else:
        print(f"Recorded audio data shape: {recorded_audio.shape}, Sample rate: {rec_sr} Hz")
        play_audio_data(recorded_audio, rec_sr)

    print("\n--- audio_utils.py tests finished. ---")