I basically want to create an AI phone agent like the ones on vapi.ai or synthflow.ai.
But I've really been struggling with Twilio: getting the audio from a live call and transcribing it. The transcription would then be processed by ChatGPT, and the response would be spoken back through Azure voices.
Sounds simple, but it has been hell. The code I'm about to show you doesn't even use Whisper; it uses Google Speech-to-Text, because I went on a long tangent and created a new project to isolate issues, and it all boils down to Twilio!
Here is my code:
app.py
from flask import Flask, request, Response
from twilio.twiml.voice_response import VoiceResponse
from twilio.rest import Client
import os
import logging
from dotenv import load_dotenv
from flask_sock import Sock
import requests
import json
from google.cloud import speech
from google.oauth2 import service_account
import whisper
import torch
import numpy as np
import soundfile as sf
import base64
import subprocess
from pydub import AudioSegment
import pywav
import time

GOOGLE_APPLICATION_CREDENTIALS_PATH = r"C:\AI Downloads\ChatGPT Coding Experiments\GPT Code Reader\EVCC\text2speech-396010-622fe4e7cb7b.json"
credentials = service_account.Credentials.from_service_account_file(GOOGLE_APPLICATION_CREDENTIALS_PATH)
speech_client = speech.SpeechClient(credentials=credentials)
# Load environment variables from .env file
load_dotenv()
logging.basicConfig(level=logging.DEBUG)
app = Flask(__name__)
sock = Sock(app)
model = whisper.load_model("base", device="cuda")
account_sid = os.getenv('TWILIO_ACCOUNT_SID')
auth_token = os.getenv('TWILIO_AUTH_TOKEN')
twilio_number = os.getenv('TWILIO_PHONE_NUMBER')
client = Client(account_sid, auth_token)

if not all([account_sid, auth_token, twilio_number]):
    raise ValueError("One or more Twilio credentials are not set.")
@app.route("/voice", methods=['POST']) def voice(): response = VoiceResponse() response.say("Hello")
`
connect = response.connect(url=f’wss://{request.host}/websocket’)
connect.stream(url=f’wss://{request.host}/websocket’)
return Response(str(response), mimetype='text/xml')`
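# For reference, the /voice route above should return TwiML along these lines:
# <Response>
#   <Say>Hello</Say>
#   <Connect>
#     <Stream url="wss://<host>/websocket" />
#   </Connect>
# </Response>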
@app.route("/call", methods=['POST']) def call():
to_number = request.json.get('to') if not to_number:
return "Missing 'to' number", 400
try: call = client.calls.create( to=to_number,
from_=twilio_number, url="https://7269-31-125-115-242.ngrok-free.app/voice" # Replace with your ngrok URL )
return f"Call initiated: {call.sid}", 200 except Exception as e: logging.error(f"Error initiating call: {str(e)}") return f"Error initiating call: {str(e)}", 500
def get_audio_info(file_path):
    # Use ffmpeg to get detailed information
    result = subprocess.run(["ffmpeg", "-f", "mulaw", "-ar", "8000", "-i", file_path], stderr=subprocess.PIPE, text=True)
    ffmpeg_output = result.stderr
    # Extract information from ffmpeg output
    format_info = None
    codec_info = None
    channels = None
    frame_rate = None
    sample_width = None
    for line in ffmpeg_output.split("\n"):
        if "Input #" in line:
            format_info = line.split(",")[1].strip()
        if "Audio:" in line:
            parts = line.split("Audio:")[1].split(",")
            codec_info = parts[0].strip()
            frame_rate = int(parts[1].strip().split(" ")[0])
            channels = parts[2].strip().split(" ")[0]
            sample_width = int(parts[3].strip().split(" ")[0].replace(" bits", ""))
    audio_info = {
        "channels": channels,
        "frame_rate (Hz)": frame_rate,
        "sample_width (bytes)": sample_width,
        "format": format_info,
        "codec": codec_info
    }
    return audio_info
@sock.route('/websocket')
def websocket(ws):
    logging.info("WebSocket connection established")
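    # Twilio Media Streams sends JSON messages over this socket: a "connected"
    # event, then a "start" event describing the media format, then "media"
    # events whose media.payload field is base64-encoded 8 kHz mono mu-law
    # audio (the captured output further down confirms this).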
    streaming_config = speech.StreamingRecognitionConfig(
        config=speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=8000,
            language_code="en-US",
        ),
        interim_results=True,
    )
    def generate_requests():
        try:
            data_list = []
            start_time = time.time()
            while True:
                data = ws.receive()
                logging.debug(f"Received data: {data}")
                if data:
                    # Write received data to a text file to avoid console spam
                    with open("received_data.txt", "a") as file:
                        file.write(data + "\n")
                    try:
                        # Parse the received data to extract the payload
                        data_dict = json.loads(data)
                        payload = data_dict.get("media", {}).get("payload", "")
                        if payload:
                            # Decode the base64 payload and add to data list
                            audio_content = base64.b64decode(payload)
                            data_list.append(audio_content)
                        else:
                            logging.debug("No payload found in received data")
                    except Exception as e:
                        logging.error(f"Error processing data: {e}")
                else:
                    break
                # Send audio data periodically to avoid timeout
                if time.time() - start_time > 1:
                    if data_list:
                        combined_audio = b"".join(data_list)
                        temp_wav_path = "temp_audio.wav"
                        wave_write = pywav.WavWrite(temp_wav_path, 1, 8000, 8, 7)  # Mono, 8000 Hz, 8 bit, MULAW encoding
                        wave_write.write(combined_audio)
                        wave_write.close()
                        with open(temp_wav_path, "rb") as wav_file:
                            wav_content = wav_file.read()
                        yield speech.StreamingRecognizeRequest(audio_content=wav_content)
                        data_list = []
                    start_time = time.time()
            # Process any remaining audio data
            if data_list:
                combined_audio = b"".join(data_list)
                temp_wav_path = "temp_audio.wav"
                wave_write = pywav.WavWrite(temp_wav_path, 1, 8000, 8, 7)  # Mono, 8000 Hz, 8 bit, MULAW encoding
                wave_write.write(combined_audio)
                wave_write.close()
                with open(temp_wav_path, "rb") as wav_file:
                    wav_content = wav_file.read()
                yield speech.StreamingRecognizeRequest(audio_content=wav_content)
        except Exception as e:
            logging.error(f"Error receiving data: {e}")
    try:
        responses = speech_client.streaming_recognize(config=streaming_config, requests=generate_requests())
        for response in responses:
            for result in response.results:
                logging.info(f"Transcribed audio data: {result.alternatives[0].transcript}")
    except Exception as e:
        logging.error(f"Error during streaming recognition: {e}")
if __name__ == "__main__":
    app.run(debug=True)
Can someone please help? I wouldn't mind if you just provided brand new code that actually works, as I've been scratching my head for two days without eating! I've tried the Twilio Gather API, but that just uses the built-in Twilio transcription, which is really bad. I want to use Whisper. Any help would be immensely appreciated.
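For what it's worth, this is roughly how I picture handing the decoded call audio to Whisper once I get past the Twilio problem. It's a minimal sketch, assuming the mu-law payloads have already been decoded to 16-bit PCM; the transcribe_pcm16 helper name is illustrative, and I'm assuming scipy's resample_poly (not part of my project) to get to the 16 kHz float32 input Whisper expects:

import numpy as np
import whisper
from scipy.signal import resample_poly

model = whisper.load_model("base")

def transcribe_pcm16(pcm16_bytes, in_rate=8000):
    # Interpret the raw bytes as 16-bit signed PCM and normalise to [-1.0, 1.0]
    samples = np.frombuffer(pcm16_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    # Whisper expects 16 kHz mono audio, so upsample from the 8 kHz phone rate
    audio_16k = resample_poly(samples, 16000, in_rate).astype(np.float32)
    result = model.transcribe(audio_16k, language="en", fp16=False)
    return result["text"]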
What I think is happening is that something is failing when converting the media payload. This is some of the output:
{"event":"connected","protocol":"Call","version":"1.0.0"}
{"event":"start","sequenceNumber":"1","start":{"accountSid":"AC2c8d4b5084b102b1d8fab4c2a631866f","streamSid":"MZ3270cabfff2b54d6e9c45dc4235f60d2","callSid":"CA46aefeef8646d5b9b0642991a6d3cf61","tracks":["inbound"],"mediaFormat":{"encoding":"audio/x-mulaw","sampleRate":8000,"channels":1},"customParameters":{}},"streamSid":"MZ3270cabfff2b54d6e9c45dc4235f60d2"}
{"event":"media","sequenceNumber":"2","media":{"track":"inbound","chunk":"1","timestamp":"79","payload":"/n5+/v7+/v7+/v7+/n5+fH5+fn7+/v7+/v7+/v7+/v7+/v5+fv5+/n5+fn5+fn7+/v5+fv5+/v7+/v5+fv7+/v7+/n5+/v5+fn7+/v7+/v7+/v7+/v5+fn7+/n7+fn5+fn5+fn7+/v7+/v7+fn5+fH7+/v7+/v7+fn5+/v7+fv7+fn5+fn5+fn5+fn5+fP7+/v5+fn58/v7+/v7+fn7+/g=="},"streamSid":"MZ3270cabfff2b54d6e9c45dc4235f60d2"}
{"event":"media","sequenceNumber":"3","media":{"track":"inbound","chunk":"2","timestamp":"99","payload":"/v5+fv5+/v7+/v7+/v7+/n5+fv7+/v5+fH7+/v7+/n7+/v7+/v7+fn5+fv5+fH7+fv5+/v7+/v7+/v7+/n5+/v7+/n5+/v7+/n5+fv7+/v5+fv7+/v7+fv7+fv7+/v7+/v7+/n5+fHx8/v7+/v7+/v7+/v7+/v7+/v7+/n7+fv5+fn5+/n7+fn5+fv5+/n5+fn5+fv7+/v5+fv7+/v7+fg=="},"streamSid":"MZ3270cabfff2b54d6e9c45dc4235f60d2"}
{"event":"media","sequenceNumber":"4","media":{"track":"inbound","chunk":"3","timestamp":"119","payload":"fv7+/v5+/v7+fn5+fn7+/v7+fn7+/v7+/v7+/v7+/v7+fn7+/v7+/v7+/v7+/v7+/v7+/v5+fv7+/v5+fH7+/n7+fn5+/v7+fn7+/v7+/v7+/v7+fn7+/v7+fv5+/v7+fn7+/v7+/v5+fn5+fH5+/v7+/v7+/v7+fn7+fv5+fn5+fn7+/v7+fn7+/v5+/n7+/v7+/v7+/n5+/v7+/v7+/g=="},"streamSid":"MZ3270cabfff2b54d6e9c45dc4235f60d2"}
{"event":"media","sequenceNumber":"5","media":{"track":"inbound","chunk":"4","timestamp":"139","payload":"/v5+fn5+/n7+fv5+fv7+/v7+/v7+/v5+fv7+/v7+/v7+/v7+fn7+/v7+fn7+fv7+/n5+/v5+fv7+/v7+fn7+/v7+/v5+fv7+/v7+/v7+/v5+fn5+fn5+/v7+fv7+/v7+fv7+/v7+/v7+/v5+fP7+fv5+fn7+/v7+/v7+/n7+fv7+/v7+/v7+fv7+/v5+fn7+/v7+/v7+/v7+/n5+/v7+fg=="},"streamSid":"MZ3270cabfff2b54d6e9c45dc4235f60d2"}
You can see it's picking up audio, but the transcription part of the function never runs. I don't know why; I've done everything I can to decode the base64 and convert the audio from mu-law to WAV, and it's getting annoying now. Also, please don't recommend audioop: it's deprecated and I can't even install it.
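In case it helps anyone diagnose this, here is the pure-NumPy mu-law decoder I've been trying instead of audioop. It's a minimal sketch of the standard G.711 mu-law expansion, so the only assumption is that the input bytes are the base64-decoded media payloads from the stream above:

import base64
import numpy as np

def mulaw_to_pcm16(mu_bytes):
    # Standard G.711 mu-law expansion: undo the bitwise complement, then
    # rebuild the magnitude from the 3-bit exponent and 4-bit mantissa,
    # subtracting the 0x84 bias, and apply the sign bit.
    u = ~np.frombuffer(mu_bytes, dtype=np.uint8)
    sign = u & 0x80
    exponent = (u.astype(np.int32) & 0x70) >> 4
    mantissa = u.astype(np.int32) & 0x0F
    magnitude = ((mantissa << 3) + 0x84) << exponent
    pcm = np.where(sign, 0x84 - magnitude, magnitude - 0x84)
    return pcm.astype(np.int16)

# e.g. pcm = mulaw_to_pcm16(base64.b64decode(data_dict["media"]["payload"]))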
Thanks!
I expected real-time transcription; instead, the audio is picked up but never converted.