I don’t understand how I can read the bytestream for a TTS azure service in python.
From the docs: https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.audiodatastream?view=azure-python
bool = can_read_data(requested_bytes: int, pos: int)
and
int = read_data(audio_buffer: bytes, pos: int | None = None)
so
<code>import azure.cognitiveservices.speech as speechsdk
speech_config = speechsdk.SpeechConfig(subscription='key', region='uksouth')
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
text = "Hello, world!"
# Synthesize the speech
result = speech_synthesizer.speak_text_async(text).get()
# Create an AudioDataStream from the synthesized result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
print("Speech synthesized for text [{}]".format(text))
audio_data_stream = speechsdk.AudioDataStream(result)
audio_data_stream.save_to_wav_file("output.wav")
# Reset the stream position to the beginning since saving to file puts the position to end.
audio_data_stream.position = 0
# Reads data from the stream
audio_buffer = bytes(16000)
total_size = 0
filled_size = audio_data_stream.read_data(audio_buffer)
while filled_size > 0:
print("{} bytes received.".format(filled_size))
total_size += filled_size
filled_size = audio_data_stream.read_data(audio_buffer)
print("Totally {} bytes received for text [{}].".format(total_size, text))
# Initialize playing
from pydub import AudioSegment
import io
audio_segment = AudioSegment(
data=audio_buffer, # The raw audio data you received
sample_width=2, # Bytes per sample
frame_rate=16000, # Sampling frequency
channels=1 # Mono
)
from pydub.playback import play
play(audio_segment)
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech synthesis canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
</code>
<code>import azure.cognitiveservices.speech as speechsdk
speech_config = speechsdk.SpeechConfig(subscription='key', region='uksouth')
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
text = "Hello, world!"
# Synthesize the speech
result = speech_synthesizer.speak_text_async(text).get()
# Create an AudioDataStream from the synthesized result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
print("Speech synthesized for text [{}]".format(text))
audio_data_stream = speechsdk.AudioDataStream(result)
audio_data_stream.save_to_wav_file("output.wav")
# Reset the stream position to the beginning since saving to file puts the position to end.
audio_data_stream.position = 0
# Reads data from the stream
audio_buffer = bytes(16000)
total_size = 0
filled_size = audio_data_stream.read_data(audio_buffer)
while filled_size > 0:
print("{} bytes received.".format(filled_size))
total_size += filled_size
filled_size = audio_data_stream.read_data(audio_buffer)
print("Totally {} bytes received for text [{}].".format(total_size, text))
# Initialize playing
from pydub import AudioSegment
import io
audio_segment = AudioSegment(
data=audio_buffer, # The raw audio data you received
sample_width=2, # Bytes per sample
frame_rate=16000, # Sampling frequency
channels=1 # Mono
)
from pydub.playback import play
play(audio_segment)
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech synthesis canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
</code>
import azure.cognitiveservices.speech as speechsdk
speech_config = speechsdk.SpeechConfig(subscription='key', region='uksouth')
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
text = "Hello, world!"
# Synthesize the speech
result = speech_synthesizer.speak_text_async(text).get()
# Create an AudioDataStream from the synthesized result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
print("Speech synthesized for text [{}]".format(text))
audio_data_stream = speechsdk.AudioDataStream(result)
audio_data_stream.save_to_wav_file("output.wav")
# Reset the stream position to the beginning since saving to file puts the position to end.
audio_data_stream.position = 0
# Reads data from the stream
audio_buffer = bytes(16000)
total_size = 0
filled_size = audio_data_stream.read_data(audio_buffer)
while filled_size > 0:
print("{} bytes received.".format(filled_size))
total_size += filled_size
filled_size = audio_data_stream.read_data(audio_buffer)
print("Totally {} bytes received for text [{}].".format(total_size, text))
# Initialize playing
from pydub import AudioSegment
import io
audio_segment = AudioSegment(
data=audio_buffer, # The raw audio data you received
sample_width=2, # Bytes per sample
frame_rate=16000, # Sampling frequency
channels=1 # Mono
)
from pydub.playback import play
play(audio_segment)
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech synthesis canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
Its streaming – and saving. But the stream doesnt sound right. What am I getting wrong?