From fcaa54c4cbff9f7940dcec7c83c1f116e55c590f Mon Sep 17 00:00:00 2001 From: Chris Hutchins Date: Thu, 26 Mar 2026 09:21:12 -0700 Subject: [PATCH] fix: downmix multi-channel audio to mono for SFSpeechRecognizer SFSpeechRecognizer silently returns no results when receiving multi-channel audio buffers. USB audio interfaces like the RODECaster Pro II send 2-channel 48kHz audio, and the previous `format: nil` tap delivered these buffers unchanged to the recognition request. Create a mono AVAudioFormat at the hardware sample rate when the device has more than one channel and pass it to installTap, letting AVAudioEngine handle the downmix automatically. Co-Authored-By: Claude Opus 4.6 (1M context) --- Textream/Textream/SpeechRecognizer.swift | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index 0730b1c..e7e8258 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -287,10 +287,10 @@ class SpeechRecognizer { recognitionRequest.shouldReportPartialResults = true let inputNode = audioEngine.inputNode - let recordingFormat = inputNode.outputFormat(forBus: 0) + let hardwareFormat = inputNode.outputFormat(forBus: 0) // Guard against invalid format during device transitions (e.g. mic switch) - guard recordingFormat.sampleRate > 0, recordingFormat.channelCount > 0 else { + guard hardwareFormat.sampleRate > 0, hardwareFormat.channelCount > 0 else { // Retry after a longer delay to let the audio system settle if retryCount < maxRetries { retryCount += 1 @@ -302,6 +302,17 @@ class SpeechRecognizer { return } + // SFSpeechRecognizer requires mono audio. Multi-channel devices (e.g. + // RODECaster Pro II at 2ch/48kHz) cause the recognition task to silently + // return no results. Request a mono tap and let AVAudioEngine downmix. + let monoFormat = AVAudioFormat( + commonFormat: hardwareFormat.commonFormat, + sampleRate: hardwareFormat.sampleRate, + channels: 1, + interleaved: hardwareFormat.isInterleaved + ) + let tapFormat = (hardwareFormat.channelCount > 1) ? monoFormat : hardwareFormat + // Observe audio configuration changes (e.g. mic switched externally) to restart gracefully configurationChangeObserver = NotificationCenter.default.addObserver( forName: .AVAudioEngineConfigurationChange, @@ -315,7 +326,7 @@ class SpeechRecognizer { // Belt-and-suspenders: ensure no stale tap exists before installing inputNode.removeTap(onBus: 0) - inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in + inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFormat) { [weak self] buffer, _ in recognitionRequest.append(buffer) guard let channelData = buffer.floatChannelData?[0] else { return }