# Run Evals #13 — workflow file for this run.
# NOTE(review): the original page warned that this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden
# Unicode characters before trusting a visual review.
| name: Run Evals | |
| on: | |
| workflow_dispatch: | |
| inputs: | |
| suite_filter: | |
| description: "Comma-separated glob patterns for eval files to run" | |
| required: false | |
| default: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml" | |
| target: | |
| description: "Optional target override (leave empty to use each eval's own target)" | |
| required: false | |
| default: "" | |
| threshold: | |
| description: "Minimum score threshold (0-1)" | |
| required: false | |
| default: "0.8" | |
| jobs: | |
| evals: | |
| name: Run AgentV Evals | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| checks: write | |
| models: read | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: ./.github/actions/setup-bun | |
| - name: Build | |
| run: bun run build | |
| - name: Install GitHub Copilot CLI | |
| run: curl -fsSL https://gh.io/copilot-install | bash | |
| - name: Configure credentials | |
| run: | | |
| cat > .env <<EOF | |
| GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }} | |
| GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }} | |
| COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }} | |
| AGENT_PROVIDER=${{ vars.AGENT_PROVIDER || 'copilot-cli' }} | |
| AGENT_MODEL=${{ vars.AGENT_MODEL || vars.COPILOT_MODEL || 'gpt-5-mini' }} | |
| EOF | |
| - name: Resolve inputs | |
| id: filter | |
| env: | |
| DEFAULT_PATTERNS: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml" | |
| # Multi-provider evals need multiple agent targets installed | |
| # simultaneously. Exclude from default CI (override via repo var). | |
| EXCLUDE_PATTERNS: "examples/showcase/multi-model-benchmark/**" | |
| run: | | |
| RAW_PATTERNS="${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" | |
| EXCLUDES="${{ vars.EVAL_EXCLUDE_PATTERNS || env.EXCLUDE_PATTERNS }}" | |
| # Append negated exclude globs so the runner skips multi-provider evals | |
| FINAL="$RAW_PATTERNS" | |
| if [ -n "$EXCLUDES" ]; then | |
| IFS=',' read -ra EXCL <<< "$EXCLUDES" | |
| for pat in "${EXCL[@]}"; do | |
| FINAL="$FINAL,!$pat" | |
| done | |
| fi | |
| echo "patterns=$FINAL" >> "$GITHUB_OUTPUT" | |
| echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}" >> "$GITHUB_OUTPUT" | |
| echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" | |
| - name: Run AgentV evals | |
| id: run-evals | |
| env: | |
| COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }} | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| run: | | |
| mkdir -p .agentv/ci-results | |
| # Split comma-separated patterns into positional args | |
| IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}" | |
| # Build optional --target flag (empty = use each eval's own target) | |
| TARGET_FLAG=() | |
| if [ -n "${{ steps.filter.outputs.target }}" ]; then | |
| TARGET_FLAG=(--target "${{ steps.filter.outputs.target }}") | |
| fi | |
| bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \ | |
| --targets .agentv/targets.yaml \ | |
| "${TARGET_FLAG[@]}" \ | |
| --workers 1 \ | |
| --threshold ${{ steps.filter.outputs.threshold }} \ | |
| -o .agentv/ci-results/junit.xml \ | |
| --benchmark-json .agentv/ci-results/benchmark.json \ | |
| --artifacts .agentv/ci-results/artifacts \ | |
| --verbose \ | |
| 2>&1 | tee .agentv/ci-results/eval-output.log | |
| echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT" | |
| - name: Publish JUnit test results | |
| if: always() | |
| uses: dorny/test-reporter@v1 | |
| with: | |
| name: AgentV Eval Results | |
| path: .agentv/ci-results/junit.xml | |
| reporter: java-junit | |
| fail-on-error: false | |
| - name: Upload eval artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: eval-results-${{ github.run_id }} | |
| path: .agentv/ci-results/ | |
| retention-days: 30 | |
| - name: Fail if threshold not met | |
| if: always() | |
| run: | | |
| if [ "${{ steps.run-evals.outputs.exit_code }}" != "0" ]; then | |
| echo "::error::Eval score below threshold (${{ steps.filter.outputs.threshold }})" | |
| exit 1 | |
| fi |