I’m trying to integrate Microsoft’s Speech Services SDK: the front end uploads an audio file (the audioData form field below) to my FastAPI backend, and I want to send it to Microsoft’s endpoint for pronunciation assessment via the SDK. Each time I do so, the call hangs and after 10+ seconds I get the error shown below. I suspect the problem might be similar to Microsoft Cognitive SpeechRecognizer Stuck, but (a) I’m using the Python SDK, which does not have the FromWavFileInput method, and (b) I tried appending 100 KB of empty buffer and it still does not work.
I’ve tested the same SDK code in a Jupyter notebook with a local WAV file and it works, so it’s the integration with FastAPI that’s causing the issue. Does anyone have any suggestions on how to solve this?
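For reference, the minimal notebook version that works is essentially the same SDK calls pointed at a local file (key, region, and file path elided here):

import json
import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_config = speechsdk.audio.AudioConfig(filename="local_sample.wav")  # known-good local recording
pronunciation_config = speechsdk.PronunciationAssessmentConfig(
    json_string=json.dumps({"GradingSystem": "HundredMark",
                            "Granularity": "Phoneme",
                            "Dimension": "Comprehensive"}))
pronunciation_config.reference_text = "I am a boy"
recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                        language='en-US',
                                        audio_config=audio_config)
pronunciation_config.apply_to(recognizer)
result = recognizer.recognize_once_async().get()
print(result.reason, result.text)  # RecognizedSpeech, with scores in result.properties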
Alternatively, does anyone know whether pronunciation assessment can be done via the REST API instead of the SDK? (I’ve sketched below, after the error output, what I imagine that would look like.)
Speech Recognition canceled: CancellationReason.Error
Error details: Timeout: no recognition result received SessionId: ce96699331684cb7ab4dbb1f619bff10
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
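Regarding the REST route: from my reading of the docs, the speech-to-text REST API for short audio seems to accept a Pronunciation-Assessment header containing a base64-encoded JSON config, roughly like the sketch below. I haven’t verified this myself, so the endpoint URL, query parameters, header names, and JSON fields here are my assumptions, not tested code:

import base64
import json
import requests

speech_key = "<my subscription key>"   # same key as the SDK path
service_region = "westus"              # my region (assumption: the same region hostname works for REST)

url = (f"https://{service_region}.stt.speech.microsoft.com"
       "/speech/recognition/conversation/cognitiveservices/v1")
# Pronunciation assessment parameters, base64-encoded into a header (per my reading of the docs)
pa_header = base64.b64encode(json.dumps({
    "ReferenceText": "I am a boy",
    "GradingSystem": "HundredMark",
    "Granularity": "Phoneme",
    "Dimension": "Comprehensive",
}).encode("utf-8")).decode("ascii")

with open("local_sample.wav", "rb") as f:
    resp = requests.post(
        url,
        params={"language": "en-US", "format": "detailed"},
        headers={
            "Ocp-Apim-Subscription-Key": speech_key,
            "Content-Type": "audio/wav; codecs=audio/pcm; samplerate=16000",
            "Pronunciation-Assessment": pa_header,
            "Accept": "application/json",
        },
        data=f,
    )
print(resp.status_code, resp.json())

For completeness, here is the FastAPI endpoint that fails (imports included):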
import json
import tempfile
from typing import Optional

import azure.cognitiveservices.speech as speechsdk
import uvicorn
from fastapi import FastAPI, File, Form, UploadFile

app = FastAPI()
speech_key = "<my subscription key>"    # loaded from config in the real app
service_region = "<my service region>"  # e.g., "westus"


@app.post('/transcribe')
async def transcriptions(audioData: UploadFile = File(...),
                         language: Optional[str] = Form(None)):
    # Read the uploaded file content
    audio_content = await audioData.read()
    # Append an empty buffer to the audio content
    audio_content += b'\x00' * 102400  # Add 100KB of silence
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
        temp_audio.write(audio_content)
        temp_audio_path = temp_audio.name
    # Create an instance of a speech config with the specified subscription key and service region.
    # Replace with your own subscription key and service region (e.g., "westus").
    # Note: the sample is for the en-US language.
    print(temp_audio_path)
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=temp_audio_path)
    reference_text = "I am a boy"
    # Create pronunciation assessment config with a JSON string (JSON format is not recommended)
    enable_miscue, enable_prosody = False, False
    config_json = {
        "GradingSystem": "HundredMark",
        "Granularity": "Phoneme",
        "Dimension": "Comprehensive",
        "ScenarioId": "",  # "" is the default scenario; ask the product team for a customized one
        "EnableMiscue": enable_miscue,
        "EnableProsodyAssessment": enable_prosody,
        "NBestPhonemeCount": 0,  # > 0 enables "spoken phoneme" mode, 0 disables it
    }
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(config_json))
    pronunciation_config.reference_text = reference_text

    # Create a speech recognizer using the temp file as audio input.
    language = 'en-US'
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                   language=language,
                                                   audio_config=audio_config)
    # Apply the pronunciation assessment config to the speech recognizer
    pronunciation_config.apply_to(speech_recognizer)
    result = speech_recognizer.recognize_once_async().get()
    # Check the result
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print('pronunciation assessment for: {}'.format(result.text))
        pronunciation_result = json.loads(
            result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult))
        print('assessment results:\n{}'.format(json.dumps(pronunciation_result, indent=4)))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Ignore this - I'm actually doing transcription with this function
    return {'is_subject': True, 'transcription': 'I am a boy'}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)