I’m trying to encode frames from an audio buffer to Opus data in a Swift iOS app and send them to a server, where they are decoded and transcribed by Whisper. The error I keep getting back from the server is opuslib.exceptions.OpusError: b'buffer too small'. I’ve also noticed that once the encoding happens on the iOS side, I see this log, which definitely looks too small: Encoded 8 bytes of data.
Below is the code for setting up the audio session and encoding the data:
class AudioSessionManager: ObservableObject {
    private var audioEngine: AVAudioEngine?
    private weak var appManager: AppManager?
    private var canSendData: Bool = false

    static let shared = AudioSessionManager(appManager: AppManager.shared)

    init(appManager: AppManager) {
        self.appManager = appManager
        setupAudioSession()
    }
    // MARK: - Audio Session Setup
    private func setupAudioSession() {
        let session = AVAudioSession.sharedInstance()
        do {
            try session.setPreferredSampleRate(48000)
            try session.setPreferredInputNumberOfChannels(1)
            try session.setCategory(.record, mode: .default, options: [])
            try session.setActive(true)
            print("Audio session setup complete with sample rate 48000 Hz and mono channel.")
        } catch {
            print("Failed to set up audio session: \(error)")
        }
    }
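    // Note: setPreferredSampleRate and setPreferredInputNumberOfChannels are
    // requests, not guarantees; the hardware may still run at e.g. 44.1 kHz,
    // which is why bufferAudioData below validates the buffer format.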
    // MARK: - Microphone Permission
    func checkMicrophonePermission() -> Bool {
        return AVAudioSession.sharedInstance().recordPermission == .granted
    }

    func requestMicrophoneAccess(completion: @escaping (Bool) -> Void) {
        let audioSession = AVAudioSession.sharedInstance()
        switch audioSession.recordPermission {
        case .granted:
            completion(true)
        case .denied:
            completion(false)
        case .undetermined:
            audioSession.requestRecordPermission { granted in
                DispatchQueue.main.async {
                    completion(granted)
                }
            }
        @unknown default:
            completion(false)
        }
    }
    // MARK: - Audio Engine Setup
    func setupAudioEngine() {
        audioEngine = AVAudioEngine()
        guard let audioEngine = audioEngine else {
            print("Audio engine could not be initialized")
            return
        }
        let inputNode = audioEngine.inputNode
        let mixerNode = AVAudioMixerNode()
        audioEngine.attach(mixerNode)
        // Valid Opus frame sizes at 48 kHz: 120, 240, 480, 960, 1920, 2880
        let opusCompatibleBufferSize: AVAudioFrameCount = 2880
        let desiredFormat = AVAudioFormat(standardFormatWithSampleRate: 48000, channels: 1)
        audioEngine.connect(inputNode, to: mixerNode, format: inputNode.inputFormat(forBus: 0))
        // installTap's bufferSize is only a request; the engine may deliver buffers
        // of a different length, so AudioStreamManager re-chunks into exact frames.
        mixerNode.installTap(onBus: 0, bufferSize: opusCompatibleBufferSize, format: desiredFormat) { [weak self] (buffer, when) in
            self?.bufferAudioData(buffer)
        }
    }
    private func bufferAudioData(_ buffer: AVAudioPCMBuffer) {
        if buffer.format.sampleRate != 48000 || buffer.format.channelCount != 1 {
            print("Buffer format mismatch: Expected 48000 Hz, 1 channel, but got \(buffer.format.sampleRate) Hz, \(buffer.format.channelCount) channels")
            return
        }
        AudioStreamManager.shared.canSendData = canSendData
        AudioStreamManager.shared.process(buffer)
    }
    // MARK: - Recording Control
    func startRecording() {
        print("Starting recording...")
        canSendData = true
        setupAudioEngine()
        WebSocketManager.shared.sendCommand("aq_start")
        do {
            try audioEngine?.start()
            print("Audio engine started.")
        } catch {
            print("Failed to start audio engine: \(error)")
        }
    }

    func stopRecording() {
        print("Stopping recording...")
        canSendData = false
        audioEngine?.stop()
        audioEngine = nil
        WebSocketManager.shared.sendCommand("aq_stop")
        print("Recording stopped.")
    }
}
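For reference, 2880 frames at 48 kHz is a 60 ms Opus frame, the largest frame duration Opus supports; this is the mapping I'm relying on when picking the tap's buffer size (the helper below is mine, just documenting the math):

func opusFrameDurationMs(_ frameCount: Int, sampleRate: Double = 48_000) -> Double {
    // 120 -> 2.5 ms, 240 -> 5 ms, 480 -> 10 ms,
    // 960 -> 20 ms, 1920 -> 40 ms, 2880 -> 60 ms
    Double(frameCount) / sampleRate * 1_000
}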
This is where I’m handling audio stream encoding and sending:
import Foundation
import AVFoundation
import Opus

class AudioStreamManager {
    static let shared = AudioStreamManager(appManager: AppManager.shared)
    var storage: ContiguousArray<Float> = []
    var opusEncoder: Opus.Encoder?
    var opusDecoder: Opus.Decoder?
    var canSendData: Bool = false
    var appManager: AppManager
    private let audioEngine = AVAudioEngine()
    private let audioPlayerNode = AVAudioPlayerNode()

    init(appManager: AppManager) {
        self.appManager = appManager
        setupOpusEncoder()
        setupOpusDecoder()
        setupAudioEngine()
    }
    // MARK: - Audio Processing
    func process(_ buffer: AVAudioPCMBuffer) {
        // Mono is assumed, so only channel 0 is read.
        guard let channelData = buffer.floatChannelData else { return }
        let samples = channelData[0]
        for i in 0..<Int(buffer.frameLength) {
            storage.append(samples[i])
        }
        checkOutput()
    }
    private func checkOutput() {
        let desiredSize = 2880
        while storage.count >= desiredSize {
            let outArray = ContiguousArray<Float>(storage.prefix(desiredSize))
            if let outBuffer = createPCMBuffer(from: outArray, sampleRate: 48000, channelCount: 1) {
                send(buffer: outBuffer)
            }
            storage.removeFirst(desiredSize)
        }
    }
    private func send(buffer: AVAudioPCMBuffer) {
        guard let encoder = opusEncoder else {
            print("Opus encoder not initialized")
            return
        }
        // 1500 bytes is the maximum packet size I allow the encoder to produce.
        var opusData = Data(count: 1500)
        do {
            let bytesEncoded = try encoder.encode(buffer, to: &opusData)
            print("Encoded \(bytesEncoded) bytes of data.")
            if !opusData.isEmpty && canSendData {
                WebSocketManager.shared.send(audioData: opusData, frameSize: UInt32(opusData.count)) {
                    print("Opus encoded audio data sent.")
                }
            }
        } catch let error as Opus.Error {
            print("Failed to encode audio: Opus Error \(error.rawValue) - \(interpretOpusError(error))")
        } catch {
            print("Failed to encode audio: \(error)")
        }
    }
    // MARK: - Opus Setup
    private func setupOpusEncoder() {
        let sampleRate = 48000.0
        let channels = 1
        guard let opusFormat = AVAudioFormat(opusPCMFormat: .float32, sampleRate: sampleRate, channels: AVAudioChannelCount(channels)) else {
            print("Invalid audio format parameters")
            return
        }
        do {
            opusEncoder = try Opus.Encoder(format: opusFormat)
            print("Opus encoder successfully created")
        } catch {
            print("Failed to create Opus encoder: \(error)")
        }
    }

    private func setupOpusDecoder() {
        let sampleRate = 48000.0
        let channels = 1
        guard let opusFormat = AVAudioFormat(opusPCMFormat: .float32, sampleRate: sampleRate, channels: AVAudioChannelCount(channels)) else {
            print("Invalid audio format parameters")
            return
        }
        do {
            opusDecoder = try Opus.Decoder(format: opusFormat)
            print("Opus decoder successfully created")
        } catch {
            print("Failed to create Opus decoder: \(error)")
        }
    }
    func decodeOpusData(_ opusData: Data) -> AVAudioPCMBuffer? {
        guard let opusDecoder = opusDecoder else {
            print("Opus decoder not initialized")
            return nil
        }
        // do {
        //     print("Attempting to decode Opus data of length: \(opusData.count)")
        //     let decodedAudioBuffer = try opusDecoder.decode(opusData)
        //     print("Successfully decoded Opus data")
        //     return decodedAudioBuffer
        // } catch let error as Opus.Error {
        //     print("Failed to decode audio: Opus Error \(error.rawValue) - \(interpretOpusError(error))")
        // } catch {
        //     print("Failed to decode audio: \(error)")
        // }
        return nil
    }
    private func interpretOpusError(_ error: Opus.Error) -> String {
        switch error {
        case .ok:
            return "No error."
        case .badArgument:
            return "One or more invalid/out of range arguments."
        case .bufferTooSmall:
            return "Not enough bytes allocated in the buffer."
        case .internalError:
            return "An internal error was detected."
        case .invalidPacket:
            return "The compressed data passed is corrupted."
        case .unimplemented:
            return "Invalid/unsupported request number."
        case .invalidState:
            return "An encoder or decoder structure is invalid or already freed."
        case .allocationFailure:
            return "Memory allocation has failed."
        default:
            return "Unknown error."
        }
    }
    // MARK: - Audio Engine Setup
    private func setupAudioEngine() {
        audioEngine.attach(audioPlayerNode)
        let format = AVAudioFormat(standardFormatWithSampleRate: 48000, channels: 1)!
        audioEngine.connect(audioPlayerNode, to: audioEngine.mainMixerNode, format: format)
        do {
            try audioEngine.start()
            print("Audio engine started")
        } catch {
            print("Failed to start audio engine: \(error)")
        }
    }

    func playDecodedAudio(_ buffer: AVAudioPCMBuffer) {
        audioPlayerNode.scheduleBuffer(buffer, completionHandler: nil)
        if !audioPlayerNode.isPlaying {
            audioPlayerNode.play()
        }
    }
}
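One thing I'm not sure about is whether encode(_:to:) shrinks opusData to the encoded length or leaves it at the full 1500 bytes, in which case I'd be sending roughly 1492 bytes of zero padding after every 8-byte packet. This is the defensive trim I have in mind (a sketch of my assumption, not something I've confirmed the library needs):

do {
    var opusData = Data(count: 1500)
    let bytesEncoded = try encoder.encode(buffer, to: &opusData)
    // If encode(_:to:) does not resize opusData, strip the trailing padding
    // so only the real packet goes over the wire.
    let packet = Data(opusData.prefix(bytesEncoded))
    WebSocketManager.shared.send(audioData: packet, frameSize: UInt32(packet.count)) {
        print("Opus encoded audio data sent.")
    }
} catch {
    print("Failed to encode audio: \(error)")
}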
// MARK: - Utility Function
func createPCMBuffer(from floatArray: ContiguousArray<Float>, sampleRate: Double, channelCount: AVAudioChannelCount) -> AVAudioPCMBuffer? {
    guard let format = AVAudioFormat(standardFormatWithSampleRate: sampleRate, channels: channelCount) else {
        print("Failed to create AVAudioFormat")
        return nil
    }
    let frameCount = AVAudioFrameCount(floatArray.count / Int(channelCount))
    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
        print("Failed to create AVAudioPCMBuffer")
        return nil
    }
    pcmBuffer.frameLength = frameCount
    // De-interleave: the standard format is non-interleaved, so each channel gets its own plane.
    let channels = UnsafeBufferPointer(start: pcmBuffer.floatChannelData, count: Int(channelCount))
    for channelIndex in 0..<Int(channelCount) {
        let channel = channels[channelIndex]
        let stride = Int(channelCount)
        for frameIndex in 0..<Int(frameCount) {
            channel[frameIndex] = floatArray[frameIndex * stride + channelIndex]
        }
    }
    return pcmBuffer
}
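For completeness, a quick standalone sanity check of the utility with a silent 60 ms mono chunk (nothing from the real pipeline):

let silent = ContiguousArray<Float>(repeating: 0, count: 2880)
if let buf = createPCMBuffer(from: silent, sampleRate: 48_000, channelCount: 1) {
    print(buf.frameLength)          // 2880
    print(buf.format.channelCount)  // 1
}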
Here’s where I’m handling the sending of the audio data via a WebSocket:
func send(audioData: Data, frameSize: UInt32, completion: @escaping () -> Void) {
    if isConnected {
        let framePrefix = "frame"
        var packetData = Data()
        packetData.append(framePrefix.data(using: .utf8)!)
        var frameSizeBytes = frameSize.littleEndian
        packetData.append(Data(bytes: &frameSizeBytes, count: MemoryLayout<UInt32>.size))
        packetData.append(audioData)
        socket?.write(data: packetData, completion: completion)
    } else {
        print("WebSocket is not connected")
    }
}
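So each WebSocket message is laid out as the ASCII prefix "frame", a 4-byte little-endian length, then the Opus packet. The receiving side is Python, but here is a Swift sketch of the parsing counterpart, just to document the layout I expect the server to follow (the function is hypothetical, not part of my code):

// Message layout: "frame" (5 bytes) | UInt32 LE payload length (4 bytes) | Opus packet
func parseFramePacket(_ message: Data) -> Data? {
    let bytes = [UInt8](message)
    let prefix = Array("frame".utf8)
    guard bytes.count >= prefix.count + 4, Array(bytes[0..<5]) == prefix else { return nil }
    let length = UInt32(bytes[5]) | (UInt32(bytes[6]) << 8)
        | (UInt32(bytes[7]) << 16) | (UInt32(bytes[8]) << 24)
    let payload = bytes.dropFirst(9)
    guard payload.count == Int(length) else { return nil }
    return Data(payload)
}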