# Workflow introduced with: "Fix offline diarization pipeline producing
# single-speaker output" (#2190)
---
# Runs a single-file speaker-diarization benchmark (AMI ES2004a) on every PR
# against main, then posts/updates a results comment on the PR.
name: Diarizer Performance Benchmark

on:
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

# Cancel superseded runs for the same ref so only the latest push is benchmarked.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  benchmark:
    name: Single File Performance Benchmark
    runs-on: macos-15
    permissions:
      contents: read
      pull-requests: write  # needed to create/update the PR comment
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"

      # Build cache is keyed on the manifests/model registries so it is
      # invalidated whenever dependencies or model wiring change.
      - name: Cache Swift packages and build
        uses: actions/cache@v4
        with:
          path: |
            .build
            ~/Library/Caches/org.swift.swiftpm
          key: ${{ runner.os }}-diarizer-${{ hashFiles('Package.swift', 'Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }}

      - name: Cache Diarizer models
        uses: actions/cache@v4
        with:
          path: ~/Library/Application Support/FluidAudio/Models/speaker-diarization-coreml
          key: ${{ runner.os }}-diarizer-models-${{ hashFiles('Sources/FluidAudio/ModelRegistry.swift', 'Sources/FluidAudio/ModelNames.swift') }}

      - name: Cache AMI dataset
        uses: actions/cache@v4
        with:
          path: ~/FluidAudioDatasets/ami_official
          key: ${{ runner.os }}-ami-dataset-${{ hashFiles('Sources/FluidAudioCLI/DatasetParsers/DatasetDownloader.swift') }}

      - name: Build package
        run: swift build -c release

      - name: Run Single File Benchmark
        id: benchmark
        timeout-minutes: 35
        run: |
          echo "🚀 Running single file benchmark..."

          # Record start time
          BENCHMARK_START=$(date +%s)

          swift run fluidaudiocli diarization-benchmark --auto-download --single-file ES2004a --output benchmark_results.json

          # Check if results file was generated
          if [ -f benchmark_results.json ]; then
            echo "SUCCESS=true" >> "$GITHUB_OUTPUT"
          else
            echo "❌ Benchmark failed - no results file generated"
            echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
          fi

          # Calculate execution time
          BENCHMARK_END=$(date +%s)
          EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START))
          EXECUTION_MINS=$((EXECUTION_TIME / 60))
          EXECUTION_SECS=$((EXECUTION_TIME % 60))
          echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> "$GITHUB_OUTPUT"

      - name: Show benchmark_results.json
        if: always()
        run: |
          echo "--- benchmark_results.json ---"
          cat benchmark_results.json || echo "benchmark_results.json not found"
          echo "-----------------------------"

      - name: Extract benchmark metrics with jq
        id: extract
        run: |
          # The output is now an array, so we need to access the first element
          DER=$(jq '.[0].der' benchmark_results.json)
          JER=$(jq '.[0].jer' benchmark_results.json)
          RTF=$(jq '.[0].rtfx' benchmark_results.json)
          DURATION="1049" # ES2004a duration in seconds
          SPEAKER_COUNT=$(jq '.[0].detectedSpeakers' benchmark_results.json)

          # Extract detailed timing information
          TOTAL_TIME=$(jq '.[0].timings.totalProcessingSeconds' benchmark_results.json)
          MODEL_DOWNLOAD_TIME=$(jq '.[0].timings.modelDownloadSeconds' benchmark_results.json)
          MODEL_COMPILE_TIME=$(jq '.[0].timings.modelCompilationSeconds' benchmark_results.json)
          AUDIO_LOAD_TIME=$(jq '.[0].timings.audioLoadingSeconds' benchmark_results.json)
          SEGMENTATION_TIME=$(jq '.[0].timings.segmentationSeconds' benchmark_results.json)
          EMBEDDING_TIME=$(jq '.[0].timings.embeddingExtractionSeconds' benchmark_results.json)
          CLUSTERING_TIME=$(jq '.[0].timings.speakerClusteringSeconds' benchmark_results.json)
          INFERENCE_TIME=$(jq '.[0].timings.totalInferenceSeconds' benchmark_results.json)

          echo "DER=${DER}" >> "$GITHUB_OUTPUT"
          echo "JER=${JER}" >> "$GITHUB_OUTPUT"
          echo "RTF=${RTF}" >> "$GITHUB_OUTPUT"
          echo "DURATION=${DURATION}" >> "$GITHUB_OUTPUT"
          echo "SPEAKER_COUNT=${SPEAKER_COUNT}" >> "$GITHUB_OUTPUT"
          echo "TOTAL_TIME=${TOTAL_TIME}" >> "$GITHUB_OUTPUT"
          echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}" >> "$GITHUB_OUTPUT"
          echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}" >> "$GITHUB_OUTPUT"
          echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}" >> "$GITHUB_OUTPUT"
          echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}" >> "$GITHUB_OUTPUT"
          echo "EMBEDDING_TIME=${EMBEDDING_TIME}" >> "$GITHUB_OUTPUT"
          echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> "$GITHUB_OUTPUT"
          echo "INFERENCE_TIME=${INFERENCE_TIME}" >> "$GITHUB_OUTPUT"

          # Validate RTFx - 0 indicates benchmark failure
          if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then
            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
            echo "RTFx value: $RTF"
            exit 1
          fi

      - name: Comment PR with Benchmark Results
        if: always()
        uses: actions/github-script@v7
        with:
          script: |
            const der = parseFloat('${{ steps.extract.outputs.DER }}');
            const jer = parseFloat('${{ steps.extract.outputs.JER }}');
            const rtf = parseFloat('${{ steps.extract.outputs.RTF }}');
            const duration = parseFloat('${{ steps.extract.outputs.DURATION }}').toFixed(1);
            const speakerCount = '${{ steps.extract.outputs.SPEAKER_COUNT }}';
            const totalTime = parseFloat('${{ steps.extract.outputs.TOTAL_TIME }}');
            const inferenceTime = parseFloat('${{ steps.extract.outputs.INFERENCE_TIME }}');
            const modelDownloadTime = parseFloat('${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}');
            const modelCompileTime = parseFloat('${{ steps.extract.outputs.MODEL_COMPILE_TIME }}');
            const audioLoadTime = parseFloat('${{ steps.extract.outputs.AUDIO_LOAD_TIME }}');
            const segmentationTime = parseFloat('${{ steps.extract.outputs.SEGMENTATION_TIME }}');
            const embeddingTime = parseFloat('${{ steps.extract.outputs.EMBEDDING_TIME }}');
            const clusteringTime = parseFloat('${{ steps.extract.outputs.CLUSTERING_TIME }}');
            const executionTime = '${{ steps.benchmark.outputs.EXECUTION_TIME }}' || 'N/A';

            let comment = '## Speaker Diarization Benchmark Results\n\n';
            comment += '### Speaker Diarization Performance\n';
            comment += '_Evaluating "who spoke when" detection accuracy_\n\n';
            comment += '| Metric | Value | Target | Status | Description |\n';
            comment += '|--------|-------|--------|---------|-------------|\n';
            comment += `| **DER** | **${der.toFixed(1)}%** | <30% | ${der < 30 ? '✅' : '⚠️'} | Diarization Error Rate (lower is better) |\n`;
            comment += `| **JER** | **${jer.toFixed(1)}%** | <25% | ${jer < 25 ? '✅' : '⚠️'} | Jaccard Error Rate |\n`;
            comment += `| **RTFx** | **${rtf.toFixed(2)}x** | >1.0x | ${rtf > 1.0 ? '✅' : '⚠️'} | Real-Time Factor (higher is faster) |\n\n`;
            comment += '### Diarization Pipeline Timing Breakdown\n';
            comment += '_Time spent in each stage of speaker diarization_\n\n';
            comment += '| Stage | Time (s) | % | Description |\n';
            comment += '|-------|----------|---|-------------|\n';
            comment += `| Model Download | ${modelDownloadTime.toFixed(3)} | ${(modelDownloadTime/totalTime*100).toFixed(1)} | Fetching diarization models |\n`;
            comment += `| Model Compile | ${modelCompileTime.toFixed(3)} | ${(modelCompileTime/totalTime*100).toFixed(1)} | CoreML compilation |\n`;
            comment += `| Audio Load | ${audioLoadTime.toFixed(3)} | ${(audioLoadTime/totalTime*100).toFixed(1)} | Loading audio file |\n`;
            comment += `| Segmentation | ${segmentationTime.toFixed(3)} | ${(segmentationTime/totalTime*100).toFixed(1)} | Detecting speech regions |\n`;
            comment += `| Embedding | ${embeddingTime.toFixed(3)} | ${(embeddingTime/totalTime*100).toFixed(1)} | Extracting speaker voices |\n`;
            comment += `| Clustering | ${clusteringTime.toFixed(3)} | ${(clusteringTime/totalTime*100).toFixed(1)} | Grouping same speakers |\n`;
            comment += `| **Total** | **${totalTime.toFixed(3)}** | **100** | **Full pipeline** |\n\n`;
            comment += '### Speaker Diarization Research Comparison\n';
            comment += '_Research baselines typically achieve 18-30% DER on standard datasets_\n\n';
            comment += '| Method | DER | Notes |\n';
            comment += '|--------|-----|-------|\n';
            comment += '| **FluidAudio** | **' + der.toFixed(1) + '%** | **On-device CoreML** |\n';
            comment += '| Research baseline | 18-30% | Standard dataset performance |\n\n';
            comment += '**Note**: RTFx shown above is from GitHub Actions runner. On Apple Silicon with ANE:\n';
            comment += '- **M2 MacBook Air (2022)**: Runs at **150 RTFx** real-time\n';
            comment += '- Performance scales with Apple Neural Engine capabilities\n\n';
            comment += `<sub>🎯 **Speaker Diarization Test** • AMI Corpus ES2004a • ${duration}s meeting audio • ${inferenceTime.toFixed(1)}s diarization time • Test runtime: ${executionTime} • ${new Date().toLocaleString('en-US', { timeZone: 'America/New_York', year: 'numeric', month: '2-digit', day: '2-digit', hour: '2-digit', minute: '2-digit', hour12: true })} EST</sub>\n\n`;
            // Add hidden identifier for reliable comment detection
            comment += '<!-- fluidaudio-benchmark-single-file -->';

            try {
              // First, try to find existing benchmark comment
              const comments = await github.rest.issues.listComments({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
              });

              // Look for existing benchmark comment (identified by the hidden tag).
              // Callback parameter is `c`, not `comment`, to avoid shadowing the
              // comment body built above.
              const existingComment = comments.data.find(c => {
                const isBot = c.user.type === 'Bot' ||
                  c.user.login === 'github-actions[bot]' ||
                  c.user.login.includes('[bot]');
                const hasIdentifier = c.body.includes('<!-- fluidaudio-benchmark-single-file -->');
                // Legacy header match so comments posted by the old workflow
                // format are updated rather than duplicated.
                const hasHeader = c.body.includes('## 🎯 Single File Benchmark Results');
                return isBot && (hasIdentifier || hasHeader);
              });

              if (existingComment) {
                // Update existing comment
                await github.rest.issues.updateComment({
                  comment_id: existingComment.id,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
                console.log('✅ Successfully updated existing benchmark comment');
              } else {
                // Create new comment if none exists
                await github.rest.issues.createComment({
                  issue_number: context.issue.number,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
                console.log('✅ Successfully posted new benchmark results comment');
              }
            } catch (error) {
              console.error('❌ Failed to update/post comment:', error.message);
              // Don't fail the workflow just because commenting failed
            }