From fcaa54c4cbff9f7940dcec7c83c1f116e55c590f Mon Sep 17 00:00:00 2001
From: Chris Hutchins <chris@chrishutchins.com>
Date: Thu, 26 Mar 2026 09:21:12 -0700
Subject: [PATCH] fix: downmix multi-channel audio to mono for
 SFSpeechRecognizer

SFSpeechRecognizer silently returns no results when receiving
multi-channel audio buffers. USB audio interfaces like the RODECaster
Pro II send 2-channel 48kHz audio, and the previous `format: nil` tap
delivered these buffers unchanged to the recognition request.

Build a mono AVAudioFormat at the hardware sample rate and pass it to
installTap whenever the input device reports more than one channel,
letting AVAudioEngine handle the downmix automatically. Single-channel
devices are tapped with the hardware format.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 Textream/Textream/SpeechRecognizer.swift | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift
index 0730b1c..e7e8258 100644
--- a/Textream/Textream/SpeechRecognizer.swift
+++ b/Textream/Textream/SpeechRecognizer.swift
@@ -287,10 +287,10 @@ class SpeechRecognizer {
         recognitionRequest.shouldReportPartialResults = true
 
         let inputNode = audioEngine.inputNode
-        let recordingFormat = inputNode.outputFormat(forBus: 0)
+        let hardwareFormat = inputNode.outputFormat(forBus: 0)
 
         // Guard against invalid format during device transitions (e.g. mic switch)
-        guard recordingFormat.sampleRate > 0, recordingFormat.channelCount > 0 else {
+        guard hardwareFormat.sampleRate > 0, hardwareFormat.channelCount > 0 else {
             // Retry after a longer delay to let the audio system settle
             if retryCount < maxRetries {
                 retryCount += 1
@@ -302,6 +302,17 @@ class SpeechRecognizer {
             return
         }
 
+        // SFSpeechRecognizer requires mono audio. Multi-channel devices (e.g.
+        // RODECaster Pro II at 2ch/48kHz) cause the recognition task to silently
+        // return no results. Request a mono tap and let AVAudioEngine downmix.
+        let monoFormat = AVAudioFormat(
+            commonFormat: hardwareFormat.commonFormat,
+            sampleRate: hardwareFormat.sampleRate,
+            channels: 1,
+            interleaved: hardwareFormat.isInterleaved
+        )
+        let tapFormat = (hardwareFormat.channelCount > 1) ? monoFormat : hardwareFormat
+
         // Observe audio configuration changes (e.g. mic switched externally) to restart gracefully
         configurationChangeObserver = NotificationCenter.default.addObserver(
             forName: .AVAudioEngineConfigurationChange,
@@ -315,7 +326,7 @@ class SpeechRecognizer {
         // Belt-and-suspenders: ensure no stale tap exists before installing
         inputNode.removeTap(onBus: 0)
 
-        inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
+        inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFormat) { [weak self] buffer, _ in
             recognitionRequest.append(buffer)
 
             guard let channelData = buffer.floatChannelData?[0] else { return }