Fix offline diarization pipeline producing single-speaker output #2190
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Diarizer Performance Benchmark | |
| on: | |
| pull_request: | |
| branches: [main] | |
| types: [opened, synchronize, reopened] | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.ref }} | |
| cancel-in-progress: true | |
| jobs: | |
| benchmark: | |
| name: Single File Performance Benchmark | |
| runs-on: macos-15 | |
| permissions: | |
| contents: read | |
| pull-requests: write | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v5 | |
| - uses: swift-actions/setup-swift@v2 | |
| with: | |
| swift-version: "6.1" | |
| - name: Cache Swift packages and build | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| .build | |
| ~/Library/Caches/org.swift.swiftpm | |
| key: ${{ runner.os }}-diarizer-${{ hashFiles('Package.swift', 'Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }} | |
| - name: Cache Diarizer models | |
| uses: actions/cache@v4 | |
| with: | |
| path: ~/Library/Application Support/FluidAudio/Models/speaker-diarization-coreml | |
| key: ${{ runner.os }}-diarizer-models-${{ hashFiles('Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }} | |
| - name: Cache AMI dataset | |
| uses: actions/cache@v4 | |
| with: | |
| path: ~/FluidAudioDatasets/ami_official | |
| key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }} | |
| - name: Build package | |
| run: swift build -c release | |
| - name: Run Single File Benchmark | |
| id: benchmark | |
| run: | | |
| echo "🚀 Running single file benchmark..." | |
| # Record start time | |
| BENCHMARK_START=$(date +%s) | |
| swift run fluidaudiocli diarization-benchmark --auto-download --single-file ES2004a --output benchmark_results.json | |
| # Check if results file was generated | |
| if [ -f benchmark_results.json ]; then | |
| echo "SUCCESS=true" >> $GITHUB_OUTPUT | |
| else | |
| echo "❌ Benchmark failed - no results file generated" | |
| echo "SUCCESS=false" >> $GITHUB_OUTPUT | |
| fi | |
| # Calculate execution time | |
| BENCHMARK_END=$(date +%s) | |
| EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START)) | |
| EXECUTION_MINS=$((EXECUTION_TIME / 60)) | |
| EXECUTION_SECS=$((EXECUTION_TIME % 60)) | |
| echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> $GITHUB_OUTPUT | |
| timeout-minutes: 35 | |
| - name: Show benchmark_results.json | |
| if: always() | |
| run: | | |
| echo "--- benchmark_results.json ---" | |
| cat benchmark_results.json || echo "benchmark_results.json not found" | |
| echo "-----------------------------" | |
| - name: Extract benchmark metrics with jq | |
| id: extract | |
| run: | | |
| # The output is now an array, so we need to access the first element | |
| DER=$(jq '.[0].der' benchmark_results.json) | |
| JER=$(jq '.[0].jer' benchmark_results.json) | |
| RTF=$(jq '.[0].rtfx' benchmark_results.json) | |
| DURATION="1049" # ES2004a duration in seconds | |
| SPEAKER_COUNT=$(jq '.[0].detectedSpeakers' benchmark_results.json) | |
| # Extract detailed timing information | |
| TOTAL_TIME=$(jq '.[0].timings.totalProcessingSeconds' benchmark_results.json) | |
| MODEL_DOWNLOAD_TIME=$(jq '.[0].timings.modelDownloadSeconds' benchmark_results.json) | |
| MODEL_COMPILE_TIME=$(jq '.[0].timings.modelCompilationSeconds' benchmark_results.json) | |
| AUDIO_LOAD_TIME=$(jq '.[0].timings.audioLoadingSeconds' benchmark_results.json) | |
| SEGMENTATION_TIME=$(jq '.[0].timings.segmentationSeconds' benchmark_results.json) | |
| EMBEDDING_TIME=$(jq '.[0].timings.embeddingExtractionSeconds' benchmark_results.json) | |
| CLUSTERING_TIME=$(jq '.[0].timings.speakerClusteringSeconds' benchmark_results.json) | |
| INFERENCE_TIME=$(jq '.[0].timings.totalInferenceSeconds' benchmark_results.json) | |
| echo "DER=${DER}" >> $GITHUB_OUTPUT | |
| echo "JER=${JER}" >> $GITHUB_OUTPUT | |
| echo "RTF=${RTF}" >> $GITHUB_OUTPUT | |
| echo "DURATION=${DURATION}" >> $GITHUB_OUTPUT | |
| echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> $GITHUB_OUTPUT | |
| echo "TOTAL_TIME=${TOTAL_TIME}" >> $GITHUB_OUTPUT | |
| echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}" >> $GITHUB_OUTPUT | |
| echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}" >> $GITHUB_OUTPUT | |
| echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}" >> $GITHUB_OUTPUT | |
| echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}" >> $GITHUB_OUTPUT | |
| echo "EMBEDDING_TIME=${EMBEDDING_TIME}" >> $GITHUB_OUTPUT | |
| echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> $GITHUB_OUTPUT | |
| echo "INFERENCE_TIME=${INFERENCE_TIME}" >> $GITHUB_OUTPUT | |
| # Validate RTFx - 0 indicates benchmark failure | |
| if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then | |
| echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed" | |
| echo "RTFx value: $RTF" | |
| exit 1 | |
| fi | |
| - name: Comment PR with Benchmark Results | |
| if: always() | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| const der = parseFloat('${{ steps.extract.outputs.DER }}'); | |
| const jer = parseFloat('${{ steps.extract.outputs.JER }}'); | |
| const rtf = parseFloat('${{ steps.extract.outputs.RTF }}'); | |
| const duration = parseFloat('${{ steps.extract.outputs.DURATION }}').toFixed(1); | |
| const speakerCount = '${{ steps.extract.outputs.SPEAKER_COUNT }}'; | |
| const totalTime = parseFloat('${{ steps.extract.outputs.TOTAL_TIME }}'); | |
| const inferenceTime = parseFloat('${{ steps.extract.outputs.INFERENCE_TIME }}'); | |
| const modelDownloadTime = parseFloat('${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}'); | |
| const modelCompileTime = parseFloat('${{ steps.extract.outputs.MODEL_COMPILE_TIME }}'); | |
| const audioLoadTime = parseFloat('${{ steps.extract.outputs.AUDIO_LOAD_TIME }}'); | |
| const segmentationTime = parseFloat('${{ steps.extract.outputs.SEGMENTATION_TIME }}'); | |
| const embeddingTime = parseFloat('${{ steps.extract.outputs.EMBEDDING_TIME }}'); | |
| const clusteringTime = parseFloat('${{ steps.extract.outputs.CLUSTERING_TIME }}'); | |
| const executionTime = '${{ steps.benchmark.outputs.EXECUTION_TIME }}' || 'N/A'; | |
| let comment = '## Speaker Diarization Benchmark Results\n\n'; | |
| comment += '### Speaker Diarization Performance\n'; | |
| comment += '_Evaluating "who spoke when" detection accuracy_\n\n'; | |
| comment += '| Metric | Value | Target | Status | Description |\n'; | |
| comment += '|--------|-------|--------|---------|-------------|\n'; | |
| comment += `| **DER** | **${der.toFixed(1)}%** | <30% | ${der < 30 ? '✅' : '⚠️'} | Diarization Error Rate (lower is better) |\n`; | |
| comment += `| **JER** | **${jer.toFixed(1)}%** | <25% | ${jer < 25 ? '✅' : '⚠️'} | Jaccard Error Rate |\n`; | |
| comment += `| **RTFx** | **${rtf.toFixed(2)}x** | >1.0x | ${rtf > 1.0 ? '✅' : '⚠️'} | Real-Time Factor (higher is faster) |\n\n`; | |
| comment += '### Diarization Pipeline Timing Breakdown\n'; | |
| comment += '_Time spent in each stage of speaker diarization_\n\n'; | |
| comment += '| Stage | Time (s) | % | Description |\n'; | |
| comment += '|-------|----------|---|-------------|\n'; | |
| comment += `| Model Download | ${modelDownloadTime.toFixed(3)} | ${(modelDownloadTime/totalTime*100).toFixed(1)} | Fetching diarization models |\n`; | |
| comment += `| Model Compile | ${modelCompileTime.toFixed(3)} | ${(modelCompileTime/totalTime*100).toFixed(1)} | CoreML compilation |\n`; | |
| comment += `| Audio Load | ${audioLoadTime.toFixed(3)} | ${(audioLoadTime/totalTime*100).toFixed(1)} | Loading audio file |\n`; | |
| comment += `| Segmentation | ${segmentationTime.toFixed(3)} | ${(segmentationTime/totalTime*100).toFixed(1)} | Detecting speech regions |\n`; | |
| comment += `| Embedding | ${embeddingTime.toFixed(3)} | ${(embeddingTime/totalTime*100).toFixed(1)} | Extracting speaker voices |\n`; | |
| comment += `| Clustering | ${clusteringTime.toFixed(3)} | ${(clusteringTime/totalTime*100).toFixed(1)} | Grouping same speakers |\n`; | |
| comment += `| **Total** | **${totalTime.toFixed(3)}** | **100** | **Full pipeline** |\n\n`; | |
| comment += '### Speaker Diarization Research Comparison\n'; | |
| comment += '_Research baselines typically achieve 18-30% DER on standard datasets_\n\n'; | |
| comment += '| Method | DER | Notes |\n'; | |
| comment += '|--------|-----|-------|\n'; | |
| comment += '| **FluidAudio** | **' + der.toFixed(1) + '%** | **On-device CoreML** |\n'; | |
| comment += '| Research baseline | 18-30% | Standard dataset performance |\n\n'; | |
| comment += '**Note**: RTFx shown above is from GitHub Actions runner. On Apple Silicon with ANE:\n'; | |
| comment += '- **M2 MacBook Air (2022)**: Runs at **150 RTFx** real-time\n'; | |
| comment += '- Performance scales with Apple Neural Engine capabilities\n\n'; | |
| comment += `<sub>🎯 **Speaker Diarization Test** • AMI Corpus ES2004a • ${duration}s meeting audio • ${inferenceTime.toFixed(1)}s diarization time • Test runtime: ${executionTime} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} EST</sub>\n\n`; | |
| // Add hidden identifier for reliable comment detection | |
| comment += '<!-- fluidaudio-benchmark-single-file -->'; | |
| try { | |
| // First, try to find existing benchmark comment | |
| const comments = await github.rest.issues.listComments({ | |
| issue_number: context.issue.number, | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| }); | |
| // Look for existing benchmark comment (identified by the hidden tag) | |
| const existingComment = comments.data.find(comment => { | |
| const isBot = comment.user.type === 'Bot' || | |
| comment.user.login === 'github-actions[bot]' || | |
| comment.user.login.includes('[bot]'); | |
| const hasIdentifier = comment.body.includes('<!-- fluidaudio-benchmark-single-file -->'); | |
| const hasHeader = comment.body.includes('## 🎯 Single File Benchmark Results'); | |
| return isBot && (hasIdentifier || hasHeader); | |
| }); | |
| if (existingComment) { | |
| // Update existing comment | |
| await github.rest.issues.updateComment({ | |
| comment_id: existingComment.id, | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| body: comment | |
| }); | |
| console.log('✅ Successfully updated existing benchmark comment'); | |
| } else { | |
| // Create new comment if none exists | |
| await github.rest.issues.createComment({ | |
| issue_number: context.issue.number, | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| body: comment | |
| }); | |
| console.log('✅ Successfully posted new benchmark results comment'); | |
| } | |
| } catch (error) { | |
| console.error('❌ Failed to update/post comment:', error.message); | |
| // Don't fail the workflow just because commenting failed | |
| } |