diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 1162d4a6a..c0639392d 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -4,13 +4,14 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru ## Reproduction -All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh): +All batch TDT, CTC earnings, streaming, and multilingual benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh): ```bash # Download models and datasets (requires internet) ./Scripts/parakeet_subset_benchmark.sh --download -# Run all 4 benchmarks offline (100 files each, sleep-prevented) +# Run all 8 benchmarks offline (100 files each, sleep-prevented) +# Includes: v3, v2, tdt-ctc-110m, CTC earnings, EOU, Nemotron, Japanese TDT, Chinese CTC ./Scripts/parakeet_subset_benchmark.sh ``` @@ -18,9 +19,10 @@ All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parak - **Hardware**: MacBook Air M2, 16 GB - **Build**: `swift build -c release` -- **Date**: 2026-03-28 +- **Date**: 2026-03-28 (English benchmarks), 2026-04-13 (Japanese TDT) - **main**: `01f1ae2b` (Fix Kokoro v2 source_noise dtype and distribution #447) - **PR**: `839010538` (standardize-asr-directory-structure) +- **Japanese TDT**: `ed20a3688` (Fix blankId mismatch for Japanese TDT model #522) ## Comparison @@ -50,6 +52,22 @@ All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parak | Vocab F-score | 88.8% | 88.8% | | RTFx | 42.81x | 44.61x | +### Japanese TDT (JSUT dataset, 100 files) + +| Model | CER | RTFx | +|---|---|---| +| Parakeet TDT Japanese (0.6B) | 7.77% | 27.7x | + +**Distribution:** +- 46% of samples below 5% CER +- 64% of samples below 10% CER +- 93% of samples below 20% CER + +**Notes:** +- Dataset: JSUT-basic5000 (Japanese 
speech corpus) +- Measured as Character Error Rate (CER) instead of Word Error Rate (WER) +- Result after blankId fix in PR #522 (was 11.31% CER with incorrect blankId=8192) + ## Verdict **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes. diff --git a/Scripts/parakeet_subset_benchmark.sh b/Scripts/parakeet_subset_benchmark.sh index e0cf0fa96..a2a4c6dcf 100755 --- a/Scripts/parakeet_subset_benchmark.sh +++ b/Scripts/parakeet_subset_benchmark.sh @@ -392,7 +392,7 @@ BASELINE_TDT_CTC_WER="3.6" BASELINE_EARNINGS_WER="16.54" BASELINE_EOU_WER="7.11" BASELINE_NEMOTRON_WER="1.99" -BASELINE_JA_CER="6.11" +BASELINE_JA_CER="7.77" BASELINE_ZH_CER="8.37" extract_wer() { diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift index 81f82c22e..cf288f5e4 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrManager.swift @@ -210,20 +210,43 @@ public actor AsrManager { throw ASRError.notInitialized } - // Adapt config's encoderHiddenSize to match the loaded model version - // (e.g. default config uses 1024 but tdtCtc110m needs 512) + // Adapt config to match the loaded model version + // Step 1: Adapt blankId if needed (e.g. 
default 8192 but tdtJa needs 3072) + var workingConfig = config + if config.tdtConfig.blankId != models.version.blankId { + let adaptedTdtConfig = TdtConfig( + includeTokenDuration: config.tdtConfig.includeTokenDuration, + maxSymbolsPerStep: config.tdtConfig.maxSymbolsPerStep, + durationBins: config.tdtConfig.durationBins, + blankId: models.version.blankId, + boundarySearchFrames: config.tdtConfig.boundarySearchFrames, + maxTokensPerChunk: config.tdtConfig.maxTokensPerChunk, + consecutiveBlankLimit: config.tdtConfig.consecutiveBlankLimit + ) + + workingConfig = ASRConfig( + sampleRate: workingConfig.sampleRate, + tdtConfig: adaptedTdtConfig, + encoderHiddenSize: workingConfig.encoderHiddenSize, + parallelChunkConcurrency: workingConfig.parallelChunkConcurrency, + streamingEnabled: workingConfig.streamingEnabled, + streamingThreshold: workingConfig.streamingThreshold + ) + } + + // Step 2: Adapt encoderHiddenSize if needed (e.g. default 1024 but tdtCtc110m needs 512) let adaptedConfig: ASRConfig - if config.encoderHiddenSize != models.version.encoderHiddenSize { + if workingConfig.encoderHiddenSize != models.version.encoderHiddenSize { adaptedConfig = ASRConfig( - sampleRate: config.sampleRate, - tdtConfig: config.tdtConfig, + sampleRate: workingConfig.sampleRate, + tdtConfig: workingConfig.tdtConfig, encoderHiddenSize: models.version.encoderHiddenSize, - parallelChunkConcurrency: config.parallelChunkConcurrency, - streamingEnabled: config.streamingEnabled, - streamingThreshold: config.streamingThreshold + parallelChunkConcurrency: workingConfig.parallelChunkConcurrency, + streamingEnabled: workingConfig.streamingEnabled, + streamingThreshold: workingConfig.streamingThreshold ) } else { - adaptedConfig = config + adaptedConfig = workingConfig } switch models.version { diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift index 2b29d26c4..ccbde68f2 100644 --- 
a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/TDT/AsrModels.swift @@ -469,22 +469,23 @@ extension AsrModels { } let defaultUnits = defaultConfiguration().computeUnits + let fileNames = getModelFileNames(version: version) let specs: [DownloadSpec] if version.hasFusedEncoder { specs = [ // Fused preprocessor+encoder runs on ANE DownloadSpec(fileName: Names.preprocessorFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.decoderFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.jointFile, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.decoder, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.joint, computeUnits: defaultUnits), ] } else { specs = [ // Preprocessor ops map to CPU-only across all platforms. DownloadSpec(fileName: Names.preprocessorFile, computeUnits: .cpuOnly), DownloadSpec(fileName: Names.encoderFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.decoderFile, computeUnits: defaultUnits), - DownloadSpec(fileName: Names.jointFile, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.decoder, computeUnits: defaultUnits), + DownloadSpec(fileName: fileNames.joint, computeUnits: defaultUnits), ] } @@ -554,10 +555,11 @@ extension AsrModels { let config = MLModelConfiguration() config.computeUnits = .cpuOnly + let fileNames = getModelFileNames(version: version) var modelsToValidate = [ ("Preprocessor", ModelNames.ASR.preprocessorFile), - ("Decoder", ModelNames.ASR.decoderFile), - ("Joint", ModelNames.ASR.jointFile), + ("Decoder", fileNames.decoder), + ("Joint", fileNames.joint), ] if !version.hasFusedEncoder { modelsToValidate.insert(("Encoder", ModelNames.ASR.encoderFile), at: 1) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 428ef1c05..95276c177 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -288,28 +288,8 @@ 
public enum ModelNames { ] } - /// CTC ja (Japanese) model names (full pipeline: Preprocessor + Encoder + CTC Decoder) - public enum CTCJa { - public static let preprocessor = "Preprocessor" - public static let encoder = "Encoder" - public static let decoder = "CtcDecoder" - - public static let preprocessorFile = preprocessor + ".mlmodelc" - public static let encoderFile = encoder + ".mlmodelc" - public static let decoderFile = decoder + ".mlmodelc" - - // Vocabulary JSON path - public static let vocabularyFile = "vocab.json" - - public static let requiredModels: Set = [ - preprocessorFile, - encoderFile, - decoderFile, - ] - } - /// TDT ja (Japanese) model names (hybrid model: CTC preprocessor/encoder + TDT decoder/joint v2) - /// NOTE: Uses parakeetCtcJa repo where v2 models are uploaded + /// NOTE: Uses parakeetJa repo where v2 models are uploaded public enum TDTJa { public static let preprocessor = "Preprocessor" public static let encoder = "Encoder" @@ -673,8 +653,7 @@ public enum ModelNames { case .parakeetCtcZhCn: return ModelNames.CTCZhCn.requiredModels case .parakeetJa: - // Repo contains BOTH CTC and TDT models - return union of both sets - return ModelNames.CTCJa.requiredModels.union(ModelNames.TDTJa.requiredModels) + return ModelNames.TDTJa.requiredModels case .parakeetEou160, .parakeetEou320, .parakeetEou1280: return ModelNames.ParakeetEOU.requiredModels case .nemotronStreaming1120, .nemotronStreaming560, .nemotronStreaming160, .nemotronStreaming80: