From 6a0c7cd910949a2daee0b08cf1c3b0d930314d77 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 12 Apr 2026 22:10:44 -0400 Subject: [PATCH 1/8] Fix Japanese TDT model download using wrong filenames The download() function was using hardcoded Names.decoderFile and Names.jointFile for all versions, but .tdtJa requires Decoderv2.mlmodelc and Jointerv2.mlmodelc. This caused modelsExist() to fail after download, triggering cache purge and infinite re-download loop. Now uses getModelFileNames(version) to get correct filenames per version. --- .../ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift index 2b29d26c4..d7b7f41b7 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift @@ -469,22 +469,23 @@ extension AsrModels { } let defaultUnits = defaultConfiguration().computeUnits + let fileNames = getModelFileNames(version: version) let specs: [DownloadSpec] if version.hasFusedEncoder { specs = [ // Fused preprocessor+encoder runs on ANE DownloadSpec(fileName: Names.preprocessorFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.decoderFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.jointFile, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.decoder, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.joint, computeUnits: defaultUnits), ] } else { specs = [ // Preprocessor ops map to CPU-only across all platforms. DownloadSpec(fileName: Names.preprocessorFile, computeUnits: .cpuOnly), DownloadSpec(fileName: Names.encoderFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.decoderFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.jointFile, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.decoder, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.joint, computeUnits: defaultUnits), ] } From 294fe10083ca779b30b60df4930c1dc3bc581caa Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 12 Apr 2026 22:18:44 -0400 Subject: [PATCH 2/8] Fix isModelValid to use version-specific decoder/joint filenames --- .../ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift index d7b7f41b7..ccbde68f2 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift @@ -555,10 +555,11 @@ extension AsrModels { let config = MLModelConfiguration() config.computeUnits = .cpuOnly + let fileNames = getModelFileNames(version: version) var modelsToValidate = [ ("Preprocessor", ModelNames.ASR.preprocessorFile), - ("Decoder", ModelNames.ASR.decoderFile), - ("Joint", ModelNames.ASR.jointFile), + ("Decoder", fileNames.decoder), + ("Joint", fileNames.joint), ] if !version.hasFusedEncoder { modelsToValidate.insert(("Encoder", ModelNames.ASR.encoderFile), at: 1) From 846924a1d067d5423b14777ee7f7be9759b04485 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sun, 12 Apr 2026 23:30:40 -0400 Subject: [PATCH 3/8] Remove unused CTCJa model definitions The parakeetJa repo contains both CTC and TDT models, but FluidAudio only supports TDT Japanese models. The CTC-only models were being downloaded but never used (no CtcJaManager exists). Changes: - Remove ModelNames.CTCJa enum (dead code) - Update parakeetJa case to only download TDTJa models - Update comment to reflect that CTC models are not supported - Saves bandwidth by not downloading unused CTC models Addresses feedback from @Josscii in PR #522 --- Sources/FluidAudio/ModelNames.swift | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 428ef1c05..95276c177 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -288,28 +288,8 @@ public enum ModelNames { ] } - /// CTC ja (Japanese) model names (full pipeline: Preprocessor + Encoder + CTC Decoder) - public enum CTCJa { - public static let preprocessor = "Preprocessor" - public static let encoder = "Encoder" - public static let decoder = "CtcDecoder" - - public static let preprocessorFile = preprocessor + ".mlmodelc" - public static let encoderFile = encoder + ".mlmodelc" - public static let decoderFile = decoder + ".mlmodelc" - - // Vocabulary JSON path - public static let vocabularyFile = "vocab.json" - - public static let requiredModels: Set = [ - preprocessorFile, - encoderFile, - decoderFile, - ] - } - /// TDT ja (Japanese) model names (hybrid model: CTC preprocessor/encoder + TDT decoder/joint v2) - /// NOTE: Uses parakeetCtcJa repo where v2 models are uploaded + /// NOTE: Uses parakeetJa repo where v2 models are uploaded public enum TDTJa { public static let preprocessor = "Preprocessor" public static let encoder = "Encoder" @@ -673,8 +653,7 @@ public enum ModelNames { case .parakeetCtcZhCn: return ModelNames.CTCZhCn.requiredModels case .parakeetJa: - // Repo contains BOTH CTC and TDT models - return union of both sets - return ModelNames.CTCJa.requiredModels.union(ModelNames.TDTJa.requiredModels) + return ModelNames.TDTJa.requiredModels case .parakeetEou160, .parakeetEou320, .parakeetEou1280: return ModelNames.ParakeetEOU.requiredModels case .nemotronStreaming1120, .nemotronStreaming560, .nemotronStreaming160, .nemotronStreaming80: From ea9ca45320e7bc1d12c516bdef70c478a3f05e10 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 13 Apr 2026 00:28:49 -0400 Subject: [PATCH 4/8] Fix missing vocabulary files in ASR model downloads The vocabulary files were defined but not included in requiredModels, causing an infinite re-download loop because modelsExist() checks for them but download() didn't fetch them. Bug symptoms: - AsrModels.download() would complete without downloading vocab files - modelsExist() would return false (vocab missing) - Download would re-trigger, clearing cache and re-downloading - Infinite loop until user intervention - Models would produce garbage output (e.g., "token_3072") due to missing vocab files Root cause: ModelNames enums defined vocabularyFile/vocabularyPath but didn't include them in their requiredModels sets. This affected: - ASR v2/v3 TDT models (parakeet_vocab.json) - ASR 110m fused models (parakeet_vocab.json) - CTC models (vocab.json) - CTC zh-CN models (vocab.json) - TDT Japanese models (vocab.json) Fix: Add vocabulary files to all affected requiredModels sets so DownloadUtils.downloadRepo() includes them in the download. Fixes the infinite loop bug reported by @Josscii in PR #522. Co-Authored-By: Claude Sonnet 4.5 --- Sources/FluidAudio/ModelNames.swift | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 95276c177..2af6920cb 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -231,6 +231,7 @@ public enum ModelNames { encoderFile, decoderFile, jointFile, + vocabularyFile, ] /// Required models for fused frontend (110m hybrid: preprocessor contains encoder) @@ -238,6 +239,7 @@ public enum ModelNames { preprocessorFile, decoderFile, jointFile, + vocabularyFile, ] /// Get vocabulary filename for specific model version @@ -261,6 +263,7 @@ public enum ModelNames { public static let requiredModels: Set = [ melSpectrogramPath, audioEncoderPath, + vocabularyPath, ] } @@ -285,6 +288,7 @@ public enum ModelNames { encoderFile, // int8 encoder encoderFp32File, // fp32 encoder decoderFile, + vocabularyFile, ] } @@ -308,6 +312,7 @@ public enum ModelNames { encoderFile, decoderFile, jointFile, + vocabularyFile, ] } From 41b266c68bd4a5dd1b5bd266f5c4c36610841066 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 13 Apr 2026 00:38:18 -0400 Subject: [PATCH 5/8] Revert "Fix missing vocabulary files in ASR model downloads" This reverts commit ea9ca45320e7bc1d12c516bdef70c478a3f05e10. --- Sources/FluidAudio/ModelNames.swift | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 2af6920cb..95276c177 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -231,7 +231,6 @@ public enum ModelNames { encoderFile, decoderFile, jointFile, - vocabularyFile, ] /// Required models for fused frontend (110m hybrid: preprocessor contains encoder) @@ -239,7 +238,6 @@ public enum ModelNames { preprocessorFile, decoderFile, jointFile, - vocabularyFile, ] /// Get vocabulary filename for specific model version @@ -263,7 +261,6 @@ public enum ModelNames { public static let requiredModels: Set = [ melSpectrogramPath, audioEncoderPath, - vocabularyPath, ] } @@ -288,7 +285,6 @@ public enum ModelNames { encoderFile, // int8 encoder encoderFp32File, // fp32 encoder decoderFile, - vocabularyFile, ] } @@ -312,7 +308,6 @@ public enum ModelNames { encoderFile, decoderFile, jointFile, - vocabularyFile, ] } From ed20a368899e27fadd38d78f2839eeac419416fd Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 13 Apr 2026 00:49:42 -0400 Subject: [PATCH 6/8] Fix blankId mismatch for Japanese TDT model The Japanese TDT model uses blankId=3072, but the default TdtConfig uses blankId=8192 (for v3 models). When the config was adapted for encoderHiddenSize in AsrManager, the blankId was not being adapted to match the model's blankId. This caused the decoder to treat blank token 3072 as a regular token, resulting in "token_3072" appearing repeatedly in transcription output (as reported by @Josscii in PR #522). Fix: Adapt both encoderHiddenSize AND blankId when creating the adapted config, using models.version.blankId for the correct value. Fixes #522 Co-Authored-By: Claude Sonnet 4.5 --- .../SlidingWindow/TDT/AsrManager.swift | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift index 81f82c22e..a28688729 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift @@ -210,13 +210,27 @@ public actor AsrManager { throw ASRError.notInitialized } - // Adapt config's encoderHiddenSize to match the loaded model version - // (e.g. default config uses 1024 but tdtCtc110m needs 512) + // Adapt config to match the loaded model version + // (e.g. encoderHiddenSize: default 1024 but tdtCtc110m needs 512) + // (e.g. blankId: default 8192 but tdtJa needs 3072) let adaptedConfig: ASRConfig - if config.encoderHiddenSize != models.version.encoderHiddenSize { + let needsHiddenSizeAdaptation = config.encoderHiddenSize != models.version.encoderHiddenSize + let needsBlankIdAdaptation = config.tdtConfig.blankId != models.version.blankId + + if needsHiddenSizeAdaptation || needsBlankIdAdaptation { + let adaptedTdtConfig = TdtConfig( + includeTokenDuration: config.tdtConfig.includeTokenDuration, + maxSymbolsPerStep: config.tdtConfig.maxSymbolsPerStep, + durationBins: config.tdtConfig.durationBins, + blankId: models.version.blankId, + boundarySearchFrames: config.tdtConfig.boundarySearchFrames, + maxTokensPerChunk: config.tdtConfig.maxTokensPerChunk, + consecutiveBlankLimit: config.tdtConfig.consecutiveBlankLimit + ) + adaptedConfig = ASRConfig( sampleRate: config.sampleRate, - tdtConfig: config.tdtConfig, + tdtConfig: adaptedTdtConfig, encoderHiddenSize: models.version.encoderHiddenSize, parallelChunkConcurrency: config.parallelChunkConcurrency, streamingEnabled: config.streamingEnabled, From 65e0c9ceffb6108b536fc3b1bf006b3207fdcda9 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 13 Apr 2026 00:58:58 -0400 Subject: [PATCH 7/8] Update benchmarks for Japanese TDT model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updated benchmark documentation and baseline to reflect Japanese TDT model performance after blankId fix in PR #522. Changes: - Documentation/ASR/benchmarks100.md: Add Japanese TDT section (7.77% CER) - Scripts/parakeet_subset_benchmark.sh: Update baseline from 6.11% to 7.77% Results (JSUT dataset, 100 files): - Mean CER: 7.77% (down from 11.31% before blankId fix) - Median CER: 6.35% - 46% below 5% CER, 64% below 10% CER, 93% below 20% CER - RTFx: 27.7x The improvement from 11.31% → 7.77% CER (31% relative) is due to fixing the blankId mismatch where the model used blankId=3072 but the decoder was configured for blankId=8192, causing blank tokens to be treated as regular tokens. Co-Authored-By: Claude Sonnet 4.5 --- Documentation/ASR/benchmarks100.md | 24 +++++++++++++++++++++--- Scripts/parakeet_subset_benchmark.sh | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 1162d4a6a..c0639392d 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -4,13 +4,14 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru ## Reproduction -All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh): +All batch TDT, CTC earnings, streaming, and multilingual benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh): ```bash # Download models and datasets (requires internet) ./Scripts/parakeet_subset_benchmark.sh --download -# Run all 4 benchmarks offline (100 files each, sleep-prevented) +# Run all 8 benchmarks offline (100 files each, sleep-prevented) +# Includes: v3, v2, tdt-ctc-110m, CTC earnings, EOU, Nemotron, Japanese TDT, Chinese CTC ./Scripts/parakeet_subset_benchmark.sh ``` @@ -18,9 +19,10 @@ All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parak - **Hardware**: MacBook Air M2, 16 GB - **Build**: `swift build -c release` -- **Date**: 2026-03-28 +- **Date**: 2026-03-28 (English benchmarks), 2026-04-13 (Japanese TDT) - **main**: `01f1ae2b` (Fix Kokoro v2 source_noise dtype and distribution #447) - **PR**: `839010538` (standardize-asr-directory-structure) +- **Japanese TDT**: `ed20a3688` (Fix blankId mismatch for Japanese TDT model #522) ## Comparison @@ -50,6 +52,22 @@ All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parak | Vocab F-score | 88.8% | 88.8% | | RTFx | 42.81x | 44.61x | +### Japanese TDT (JSUT dataset, 100 files) + +| Metric | CER | RTFx | +|---|---|---| +| Parakeet TDT Japanese (0.6B) | 7.77% | 27.7x | + +**Distribution:** +- 46% of samples below 5% CER +- 64% of samples below 10% CER +- 93% of samples below 20% CER + +**Notes:** +- Dataset: JSUT-basic5000 (Japanese speech corpus) +- Measured as Character Error Rate (CER) instead of Word Error Rate (WER) +- Result after blankId fix in PR #522 (was 11.31% CER with incorrect blankId=8192) + ## Verdict **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes. diff --git a/Scripts/parakeet_subset_benchmark.sh b/Scripts/parakeet_subset_benchmark.sh index e0cf0fa96..a2a4c6dcf 100755 --- a/Scripts/parakeet_subset_benchmark.sh +++ b/Scripts/parakeet_subset_benchmark.sh @@ -392,7 +392,7 @@ BASELINE_TDT_CTC_WER="3.6" BASELINE_EARNINGS_WER="16.54" BASELINE_EOU_WER="7.11" BASELINE_NEMOTRON_WER="1.99" -BASELINE_JA_CER="6.11" +BASELINE_JA_CER="7.77" BASELINE_ZH_CER="8.37" extract_wer() { From a35a5f29efdebb71b6acd53af58c94b18333590f Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Mon, 13 Apr 2026 01:21:37 -0400 Subject: [PATCH 8/8] Refactor config adaptation to separate independent concerns Separated blankId and encoderHiddenSize adaptations into independent logic blocks as suggested by @Josscii in review. Changes: - Step 1: Adapt blankId if needed (workingConfig) - Step 2: Adapt encoderHiddenSize if needed (adaptedConfig) Before: Both checks were combined with OR, creating unclear dependency After: Each adaptation is handled independently in sequence This makes the code clearer and follows separation of concerns - blankId and encoderHiddenSize are unrelated model properties that should be adapted independently. Addresses review feedback in PR #522. Co-Authored-By: Claude Sonnet 4.5 --- .../SlidingWindow/TDT/AsrManager.swift | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift index a28688729..cf288f5e4 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift @@ -211,13 +211,9 @@ public actor AsrManager { } // Adapt config to match the loaded model version - // (e.g. encoderHiddenSize: default 1024 but tdtCtc110m needs 512) - // (e.g. blankId: default 8192 but tdtJa needs 3072) - let adaptedConfig: ASRConfig - let needsHiddenSizeAdaptation = config.encoderHiddenSize != models.version.encoderHiddenSize - let needsBlankIdAdaptation = config.tdtConfig.blankId != models.version.blankId - - if needsHiddenSizeAdaptation || needsBlankIdAdaptation { + // Step 1: Adapt blankId if needed (e.g. default 8192 but tdtJa needs 3072) + var workingConfig = config + if config.tdtConfig.blankId != models.version.blankId { let adaptedTdtConfig = TdtConfig( includeTokenDuration: config.tdtConfig.includeTokenDuration, maxSymbolsPerStep: config.tdtConfig.maxSymbolsPerStep, @@ -228,16 +224,29 @@ public actor AsrManager { consecutiveBlankLimit: config.tdtConfig.consecutiveBlankLimit ) - adaptedConfig = ASRConfig( - sampleRate: config.sampleRate, + workingConfig = ASRConfig( + sampleRate: workingConfig.sampleRate, tdtConfig: adaptedTdtConfig, + encoderHiddenSize: workingConfig.encoderHiddenSize, + parallelChunkConcurrency: workingConfig.parallelChunkConcurrency, + streamingEnabled: workingConfig.streamingEnabled, + streamingThreshold: workingConfig.streamingThreshold + ) + } + + // Step 2: Adapt encoderHiddenSize if needed (e.g. default 1024 but tdtCtc110m needs 512) + let adaptedConfig: ASRConfig + if workingConfig.encoderHiddenSize != models.version.encoderHiddenSize { + adaptedConfig = ASRConfig( + sampleRate: workingConfig.sampleRate, + tdtConfig: workingConfig.tdtConfig, encoderHiddenSize: models.version.encoderHiddenSize, - parallelChunkConcurrency: config.parallelChunkConcurrency, - streamingEnabled: config.streamingEnabled, - streamingThreshold: config.streamingThreshold + parallelChunkConcurrency: workingConfig.parallelChunkConcurrency, + streamingEnabled: workingConfig.streamingEnabled, + streamingThreshold: workingConfig.streamingThreshold ) } else { - adaptedConfig = config + adaptedConfig = workingConfig } switch models.version {