# Workflow: Run Evals (captured from run #13's "Workflow file for this run" view)
name: Run Evals

# Manually-dispatched eval workflow: builds the CLI, installs Copilot CLI,
# resolves glob patterns/target/threshold from inputs + repo vars, runs the
# AgentV eval suite, publishes JUnit results, and fails if score < threshold.
on:
  workflow_dispatch:
    inputs:
      suite_filter:
        description: "Comma-separated glob patterns for eval files to run"
        required: false
        default: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
      target:
        description: "Optional target override (leave empty to use each eval's own target)"
        required: false
        default: ""
      threshold:
        description: "Minimum score threshold (0-1)"
        required: false
        default: "0.8"

jobs:
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    permissions:
      contents: read
      checks: write   # needed by dorny/test-reporter to create a check run
      models: read    # GitHub Models token scope for grader calls
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/setup-bun
      - name: Build
        run: bun run build
      - name: Install GitHub Copilot CLI
        run: curl -fsSL https://gh.io/copilot-install | bash
      - name: Configure credentials
        # Write provider credentials/model choices into .env; falls back
        # through PAT -> repo secret -> the ephemeral GITHUB_TOKEN.
        run: |
          cat > .env <<EOF
          GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
          GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
          COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
          AGENT_PROVIDER=${{ vars.AGENT_PROVIDER || 'copilot-cli' }}
          AGENT_MODEL=${{ vars.AGENT_MODEL || vars.COPILOT_MODEL || 'gpt-5-mini' }}
          EOF
      - name: Resolve inputs
        id: filter
        env:
          DEFAULT_PATTERNS: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
          # Multi-provider evals need multiple agent targets installed
          # simultaneously. Exclude from default CI (override via repo var).
          EXCLUDE_PATTERNS: "examples/showcase/multi-model-benchmark/**"
        run: |
          RAW_PATTERNS="${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}"
          EXCLUDES="${{ vars.EVAL_EXCLUDE_PATTERNS || env.EXCLUDE_PATTERNS }}"
          # Append negated exclude globs so the runner skips multi-provider evals
          FINAL="$RAW_PATTERNS"
          if [ -n "$EXCLUDES" ]; then
            IFS=',' read -ra EXCL <<< "$EXCLUDES"
            for pat in "${EXCL[@]}"; do
              FINAL="$FINAL,!$pat"
            done
          fi
          echo "patterns=$FINAL" >> "$GITHUB_OUTPUT"
          echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}" >> "$GITHUB_OUTPUT"
          echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT"
      - name: Run AgentV evals
        id: run-evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          mkdir -p .agentv/ci-results
          # Split comma-separated patterns into positional args
          IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}"
          # Build optional --target flag (empty = use each eval's own target)
          TARGET_FLAG=()
          if [ -n "${{ steps.filter.outputs.target }}" ]; then
            TARGET_FLAG=(--target "${{ steps.filter.outputs.target }}")
          fi
          bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
            --targets .agentv/targets.yaml \
            "${TARGET_FLAG[@]}" \
            --workers 1 \
            --threshold "${{ steps.filter.outputs.threshold }}" \
            -o .agentv/ci-results/junit.xml \
            --benchmark-json .agentv/ci-results/benchmark.json \
            --artifacts .agentv/ci-results/artifacts \
            --verbose \
            2>&1 | tee .agentv/ci-results/eval-output.log
          # tee masks the CLI's exit code; capture it from PIPESTATUS so the
          # final gating step can decide pass/fail.
          echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT"
      - name: Publish JUnit test results
        if: always()
        uses: dorny/test-reporter@v1
        with:
          name: AgentV Eval Results
          path: .agentv/ci-results/junit.xml
          reporter: java-junit
          fail-on-error: false
      - name: Upload eval artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_id }}
          path: .agentv/ci-results/
          retention-days: 30
      - name: Fail if threshold not met
        if: always()
        run: |
          if [ "${{ steps.run-evals.outputs.exit_code }}" != "0" ]; then
            echo "::error::Eval score below threshold (${{ steps.filter.outputs.threshold }})"
            exit 1
          fi