24 changes: 21 additions & 3 deletions Documentation/ASR/benchmarks100.md
@@ -4,23 +4,25 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-struc

## Reproduction

-All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh):
+All batch TDT, CTC earnings, streaming, and multilingual benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh):

```bash
# Download models and datasets (requires internet)
./Scripts/parakeet_subset_benchmark.sh --download

-# Run all 4 benchmarks offline (100 files each, sleep-prevented)
+# Run all 8 benchmarks offline (100 files each, sleep-prevented)
+# Includes: v3, v2, tdt-ctc-110m, CTC earnings, EOU, Nemotron, Japanese TDT, Chinese CTC
./Scripts/parakeet_subset_benchmark.sh
```

## Environment

- **Hardware**: MacBook Air M2, 16 GB
- **Build**: `swift build -c release`
-- **Date**: 2026-03-28
+- **Date**: 2026-03-28 (English benchmarks), 2026-04-13 (Japanese TDT)
- **main**: `01f1ae2b` (Fix Kokoro v2 source_noise dtype and distribution #447)
- **PR**: `839010538` (standardize-asr-directory-structure)
+- **Japanese TDT**: `ed20a3688` (Fix blankId mismatch for Japanese TDT model #522)

## Comparison

@@ -50,6 +52,22 @@ All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parak
| Vocab F-score | 88.8% | 88.8% |
| RTFx | 42.81x | 44.61x |

+### Japanese TDT (JSUT dataset, 100 files)
+
+| Metric | CER | RTFx |
+|---|---|---|
+| Parakeet TDT Japanese (0.6B) | 7.77% | 27.7x |
+
+**Distribution:**
+- 46% of samples below 5% CER
+- 64% of samples below 10% CER
+- 93% of samples below 20% CER
+
+**Notes:**
+- Dataset: JSUT-basic5000 (Japanese speech corpus)
+- Measured as Character Error Rate (CER) instead of Word Error Rate (WER)
+- Result after blankId fix in PR #522 (was 11.31% CER with incorrect blankId=8192)

## Verdict

**No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes.
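For readers of the Japanese TDT section above: CER is the character-level analogue of WER — the Levenshtein edit distance between reference and hypothesis characters, divided by reference length. A minimal sketch of the metric (Python for illustration; this helper is hypothetical, not the repository's implementation):

```python
def cer(reference: str, hypothesis: str) -> float:
    """Character Error Rate: character-level edit distance / reference length."""
    r, h = reference, hypothesis
    # Single-row dynamic-programming Levenshtein distance
    dp = list(range(len(h) + 1))
    for i in range(1, len(r) + 1):
        prev, dp[0] = dp[0], i
        for j in range(1, len(h) + 1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,                          # deletion
                        dp[j - 1] + 1,                      # insertion
                        prev + (r[i - 1] != h[j - 1]))      # substitution or match
            prev = cur
    return dp[len(h)] / max(len(r), 1)

print(cer("こんにちは", "こんにちわ"))  # 1 substitution over 5 chars -> 0.2
```

Unlike WER, this needs no tokenizer, which is why it is the standard metric for Japanese and Chinese ASR.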
2 changes: 1 addition & 1 deletion Scripts/parakeet_subset_benchmark.sh
@@ -392,7 +392,7 @@ BASELINE_TDT_CTC_WER="3.6"
BASELINE_EARNINGS_WER="16.54"
BASELINE_EOU_WER="7.11"
BASELINE_NEMOTRON_WER="1.99"
-BASELINE_JA_CER="6.11"
+BASELINE_JA_CER="7.77"
BASELINE_ZH_CER="8.37"

extract_wer() {
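The `BASELINE_*` constants above presumably gate regressions when the benchmark reruns. A minimal sketch of that kind of check (Python for illustration — the actual script is bash, and the tolerance value here is an assumption, not the script's):

```python
def check_against_baseline(measured: float, baseline: float, tolerance: float = 0.5) -> bool:
    """Pass when the measured error rate is within `tolerance` points of the baseline."""
    return measured <= baseline + tolerance

# The updated Japanese baseline (7.77) matches the post-fix measurement,
# while the pre-fix CER (11.31) would have flagged a regression.
assert check_against_baseline(7.77, 7.77)
assert not check_against_baseline(11.31, 7.77)
```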
41 changes: 32 additions & 9 deletions Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift
@@ -210,20 +210,43 @@ public actor AsrManager {
throw ASRError.notInitialized
}

-// Adapt config's encoderHiddenSize to match the loaded model version
-// (e.g. default config uses 1024 but tdtCtc110m needs 512)
+// Adapt config to match the loaded model version
+// Step 1: Adapt blankId if needed (e.g. default 8192 but tdtJa needs 3072)
+var workingConfig = config
+if config.tdtConfig.blankId != models.version.blankId {
+let adaptedTdtConfig = TdtConfig(
+includeTokenDuration: config.tdtConfig.includeTokenDuration,
+maxSymbolsPerStep: config.tdtConfig.maxSymbolsPerStep,
+durationBins: config.tdtConfig.durationBins,
+blankId: models.version.blankId,
+boundarySearchFrames: config.tdtConfig.boundarySearchFrames,
+maxTokensPerChunk: config.tdtConfig.maxTokensPerChunk,
+consecutiveBlankLimit: config.tdtConfig.consecutiveBlankLimit
+)
+
+workingConfig = ASRConfig(
+sampleRate: workingConfig.sampleRate,
+tdtConfig: adaptedTdtConfig,
+encoderHiddenSize: workingConfig.encoderHiddenSize,
+parallelChunkConcurrency: workingConfig.parallelChunkConcurrency,
+streamingEnabled: workingConfig.streamingEnabled,
+streamingThreshold: workingConfig.streamingThreshold
+)
+}
+
+// Step 2: Adapt encoderHiddenSize if needed (e.g. default 1024 but tdtCtc110m needs 512)
let adaptedConfig: ASRConfig
-if config.encoderHiddenSize != models.version.encoderHiddenSize {
+if workingConfig.encoderHiddenSize != models.version.encoderHiddenSize {
adaptedConfig = ASRConfig(
-sampleRate: config.sampleRate,
-tdtConfig: config.tdtConfig,
+sampleRate: workingConfig.sampleRate,
+tdtConfig: workingConfig.tdtConfig,
encoderHiddenSize: models.version.encoderHiddenSize,
-parallelChunkConcurrency: config.parallelChunkConcurrency,
-streamingEnabled: config.streamingEnabled,
-streamingThreshold: config.streamingThreshold
+parallelChunkConcurrency: workingConfig.parallelChunkConcurrency,
+streamingEnabled: workingConfig.streamingEnabled,
+streamingThreshold: workingConfig.streamingThreshold
)
} else {
-adaptedConfig = config
+adaptedConfig = workingConfig
}

switch models.version {
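The rebuilt-config pattern in the hunk above — copying every field just to change one — is what immutable value-type configs without a member-wise update require. In Python the equivalent idiom is `dataclasses.replace`; a sketch under assumed, illustrative field names (not the project's code):

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class TdtConfig:
    # Field names and defaults are illustrative only
    blank_id: int = 8192
    max_symbols_per_step: int = 10

# Rebuild an immutable config with a single field adapted,
# analogous to the blankId adaptation in the Swift hunk above.
default = TdtConfig()
adapted = replace(default, blank_id=3072)
assert adapted.blank_id == 3072 and adapted.max_symbols_per_step == 10
```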
@@ -469,22 +469,23 @@ extension AsrModels {
}

let defaultUnits = defaultConfiguration().computeUnits
+let fileNames = getModelFileNames(version: version)

let specs: [DownloadSpec]
if version.hasFusedEncoder {
specs = [
// Fused preprocessor+encoder runs on ANE
DownloadSpec(fileName: Names.preprocessorFile, computeUnits: defaultUnits),
-DownloadSpec(fileName: Names.decoderFile, computeUnits: defaultUnits),
-DownloadSpec(fileName: Names.jointFile, computeUnits: defaultUnits),
+DownloadSpec(fileName: fileNames.decoder, computeUnits: defaultUnits),
+DownloadSpec(fileName: fileNames.joint, computeUnits: defaultUnits),
]
} else {
specs = [
// Preprocessor ops map to CPU-only across all platforms.
DownloadSpec(fileName: Names.preprocessorFile, computeUnits: .cpuOnly),
DownloadSpec(fileName: Names.encoderFile, computeUnits: defaultUnits),
-DownloadSpec(fileName: Names.decoderFile, computeUnits: defaultUnits),
-DownloadSpec(fileName: Names.jointFile, computeUnits: defaultUnits),
+DownloadSpec(fileName: fileNames.decoder, computeUnits: defaultUnits),
+DownloadSpec(fileName: fileNames.joint, computeUnits: defaultUnits),
]
}

@@ -554,10 +555,11 @@ extension AsrModels {
let config = MLModelConfiguration()
config.computeUnits = .cpuOnly

+let fileNames = getModelFileNames(version: version)
var modelsToValidate = [
("Preprocessor", ModelNames.ASR.preprocessorFile),
-("Decoder", ModelNames.ASR.decoderFile),
-("Joint", ModelNames.ASR.jointFile),
+("Decoder", fileNames.decoder),
+("Joint", fileNames.joint),
]
if !version.hasFusedEncoder {
modelsToValidate.insert(("Encoder", ModelNames.ASR.encoderFile), at: 1)
25 changes: 2 additions & 23 deletions Sources/FluidAudio/ModelNames.swift
@@ -288,28 +288,8 @@ public enum ModelNames {
]
}

-/// CTC ja (Japanese) model names (full pipeline: Preprocessor + Encoder + CTC Decoder)
-public enum CTCJa {
-public static let preprocessor = "Preprocessor"
-public static let encoder = "Encoder"
-public static let decoder = "CtcDecoder"
-
-public static let preprocessorFile = preprocessor + ".mlmodelc"
-public static let encoderFile = encoder + ".mlmodelc"
-public static let decoderFile = decoder + ".mlmodelc"
-
-// Vocabulary JSON path
-public static let vocabularyFile = "vocab.json"
-
-public static let requiredModels: Set<String> = [
-preprocessorFile,
-encoderFile,
-decoderFile,
-]
-}
-
/// TDT ja (Japanese) model names (hybrid model: CTC preprocessor/encoder + TDT decoder/joint v2)
-/// NOTE: Uses parakeetCtcJa repo where v2 models are uploaded
+/// NOTE: Uses parakeetJa repo where v2 models are uploaded
public enum TDTJa {
public static let preprocessor = "Preprocessor"
public static let encoder = "Encoder"
@@ -673,8 +653,7 @@ public enum ModelNames {
case .parakeetCtcZhCn:
return ModelNames.CTCZhCn.requiredModels
case .parakeetJa:
-// Repo contains BOTH CTC and TDT models - return union of both sets
-return ModelNames.CTCJa.requiredModels.union(ModelNames.TDTJa.requiredModels)
+return ModelNames.TDTJa.requiredModels
case .parakeetEou160, .parakeetEou320, .parakeetEou1280:
return ModelNames.ParakeetEOU.requiredModels
case .nemotronStreaming1120, .nemotronStreaming560, .nemotronStreaming160, .nemotronStreaming80: