I’m trying to make an app that records my microphone’s audio on one thread and splits it into 5-second chunks. Each chunk is placed into a queue that feeds a second thread, which transcribes the audio to text and adds the text to a second queue. Finally, the text goes to a third thread, which will perform a variety of operations on the string. For now, it just prints it for debug purposes.
import sounddevice as sd
import numpy as np
import threading
import queue
import time
from faster_whisper import WhisperModel
audio_model = WhisperModel(model_path) # Initialize faster_whisper model
fs = 44100 # Sampling rate
audio_q = queue.Queue() # Queue to communicate raw audio data between threads
text_q = queue.Queue() # Queue to communicate transcribed text between threads
short_chunk_duration = 0.1 # Short chunk duration in seconds (e.g., 100ms)
long_chunk_duration = 5 # Desired chunk duration in seconds (5 seconds)
# --- Thread Functions ---
def record_audio():
    """Continuously records audio and adds 5-second chunks to the queue."""
    accumulated_buffer = []

    def callback(indata, frames, time, status):
        nonlocal accumulated_buffer
        if status:
            print(status)
        accumulated_buffer.append(indata.copy())
        # Check if we have accumulated enough data for a 5-second chunk
        if (
            len(accumulated_buffer) * int(short_chunk_duration * fs)
            >= long_chunk_duration * fs
        ):
            # Concatenate the accumulated chunks into one array
            long_chunk = np.concatenate(accumulated_buffer)
            audio_q.put(long_chunk)  # Add the 5-second chunk to the queue
            accumulated_buffer = []  # Reset the buffer

    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        while True:
            sd.sleep(int(short_chunk_duration * 1000))
def transcribe_audio():
    """Takes audio from the queue, transcribes it, and puts text in the queue."""
    while True:
        audio_data = audio_q.get()
        audio_array = np.frombuffer(audio_data, dtype=np.float32)
        # Confirming audio_array fits the requirements of faster_whisper
        if audio_array.ndim == 1:
            audio_array = np.expand_dims(audio_array, axis=0)
        # Transcribe with faster_whisper
        segments, _ = audio_model.transcribe(audio_array)
        # faster_whisper returns a generator of segments; collect the text from all of them
        text = ""
        for segment in segments:
            text += segment.text
        print(f"Transcribed: {text}")
        audio_q.task_done()
        text_q.put(text)  # Put transcribed text into the queue
def process_text():
    """Takes transcribed text from the queue; for now, just prints it for debugging."""
    while True:
        text = text_q.get()
        print(text)
        text_q.task_done()
# --- Start Threads ---
if __name__ == "__main__":
    recording_thread = threading.Thread(target=record_audio)
    transcribing_thread = threading.Thread(target=transcribe_audio)
    processing_thread = threading.Thread(target=process_text)
    recording_thread.daemon = True
    transcribing_thread.daemon = True
    processing_thread.daemon = True
    recording_thread.start()
    transcribing_thread.start()
    processing_thread.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Terminating threads...")
It should transcribe my microphone in 5-second chunks and print the text to the console. Instead, it gives me an error saying that numpy is trying to allocate 960 GB of RAM. I haven’t used numpy much, so my guess is that it’s something to do with the logic that expands the audio into an array.
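For reference, here is a minimal standalone sketch I used to check what shapes the array actually goes through (it simulates one chunk the way record_audio() builds it, using the same fs and dtype as above; as far as I can tell from the faster_whisper docs, transcribe() wants a 1-D float32 array):

import numpy as np

# Simulate one 5-second chunk the way record_audio() builds it:
# sounddevice's InputStream delivers float32 blocks of shape (frames, channels).
fs = 44100
chunk = np.zeros((5 * fs, 1), dtype=np.float32)   # mono, shape (220500, 1)

audio_array = np.frombuffer(chunk, dtype=np.float32)
print(audio_array.shape)   # (220500,) -- frombuffer flattens the buffer to 1-D

audio_array = np.expand_dims(audio_array, axis=0)
print(audio_array.shape)   # (1, 220500) -- now 2-D, which is what gets passed
                           # to audio_model.transcribe() in my code above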