From 25a8abd81b49210db314593f432317827f387e4f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 03:47:42 +0000 Subject: [PATCH 01/21] feat(ci): add GitHub Actions workflow to run evals Adds a workflow_dispatch workflow that runs AgentV evals in CI using GitHub Copilot CLI and GitHub Models. Runs from source (bun apps/cli/dist/cli.js) instead of installing agentv from npm. Closes #892 Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 85 +++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 .github/workflows/evals.yml diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 00000000..8ca1402b --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,85 @@ +name: Run Evals + +on: + workflow_dispatch: + inputs: + suite_filter: + description: "Glob pattern for eval files to run" + required: false + default: "evals/**/eval.yaml" + threshold: + description: "Minimum score threshold (0-1)" + required: false + default: "0.8" + +jobs: + evals: + name: Run AgentV Evals + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-bun + + - name: Build + run: bun run build + + - name: Install GitHub Copilot CLI + run: curl -fsSL https://gh.io/copilot-install | bash + + - name: Configure credentials + run: | + { + echo "GITHUB_MODELS_TOKEN=${{ secrets.GITHUB_MODELS_TOKEN || secrets.GITHUB_TOKEN }}" + echo "GITHUB_MODELS_MODEL=${{ vars.GITHUB_MODELS_MODEL || 'openai/gpt-5-mini' }}" + echo "GITHUB_MODELS_GRADER_MODEL=${{ vars.GITHUB_MODELS_GRADER_MODEL || 'openai/gpt-5-mini' }}" + } > .env + + - name: Resolve filter and threshold + id: filter + run: | + echo "pattern=${{ github.event.inputs.suite_filter || 'evals/**/eval.yaml' }}" >> "$GITHUB_OUTPUT" + echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" + + - name: Run AgentV evals + id: run-evals + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT 
}} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + mkdir -p .agentv/ci-results + + bun apps/cli/dist/cli.js eval run "${{ steps.filter.outputs.pattern }}" \ + --workers 1 \ + --threshold ${{ steps.filter.outputs.threshold }} \ + -o .agentv/ci-results/junit.xml \ + --benchmark-json .agentv/ci-results/benchmark.json \ + --artifacts .agentv/ci-results/artifacts \ + --verbose \ + 2>&1 | tee .agentv/ci-results/eval-output.log + + echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT" + + - name: Publish JUnit test results + if: always() + uses: dorny/test-reporter@v1 + with: + name: AgentV Eval Results + path: .agentv/ci-results/junit.xml + reporter: java-junit + fail-on-error: false + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-results-${{ github.run_id }} + path: .agentv/ci-results/ + retention-days: 30 + + - name: Fail if threshold not met + if: always() + run: | + if [ "${{ steps.run-evals.outputs.exit_code }}" != "0" ]; then + echo "::error::Eval score below threshold (${{ steps.filter.outputs.threshold }})" + exit 1 + fi From bb57287b4697482fcec7d621678322e66de8606d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 03:52:02 +0000 Subject: [PATCH 02/21] fix(ci): use GH_MODELS_TOKEN with default OpenAI target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace .env-only credentials with a proper .agentv/targets.yaml that sets GitHub Models as the default target via OpenAI provider. Remove Copilot CLI dependency — evals use the LLM target directly. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 8ca1402b..972ac89d 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -23,16 +23,22 @@ jobs: - name: Build run: bun run build - - name: Install GitHub Copilot CLI - run: curl -fsSL https://gh.io/copilot-install | bash - - - name: Configure credentials + - name: Configure targets run: | - { - echo "GITHUB_MODELS_TOKEN=${{ secrets.GITHUB_MODELS_TOKEN || secrets.GITHUB_TOKEN }}" - echo "GITHUB_MODELS_MODEL=${{ vars.GITHUB_MODELS_MODEL || 'openai/gpt-5-mini' }}" - echo "GITHUB_MODELS_GRADER_MODEL=${{ vars.GITHUB_MODELS_GRADER_MODEL || 'openai/gpt-5-mini' }}" - } > .env + mkdir -p .agentv + cat > .agentv/targets.yaml <<'TARGETS' + targets: + - name: default + provider: openai + base_url: https://models.github.ai/inference + api_key: ${{ GH_MODELS_TOKEN }} + model: ${{ GH_MODELS_MODEL }} + TARGETS + + cat > .env < Date: Wed, 1 Apr 2026 03:58:57 +0000 Subject: [PATCH 03/21] feat(ci): add GH Models default target with copilot-cli/copilot-sdk Update .agentv/targets.yaml: - Change default target to GitHub Models (openai provider, models.github.ai) - Add copilot-cli and copilot-sdk targets using GH_MODELS_MODEL - Keep existing pi, codex, gemini, openai, openrouter targets Update evals workflow: - Restore Copilot CLI install step - Write .env with GH_MODELS_TOKEN/GH_MODELS_MODEL (targets.yaml references these) - Remove inline targets.yaml generation Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 22 ++++++++++++++++------ .github/workflows/evals.yml | 22 +++++++++------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index de3f4cf0..44c2f062 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -3,12 +3,22 @@ targets: - name: default - 
provider: pi-coding-agent - subprovider: openrouter - model: z-ai/glm-4.7 - api_key: ${{ OPENROUTER_API_KEY }} - system_prompt: "Answer directly based on the information provided." - grader_target: gemini-flash + provider: openai + base_url: https://models.github.ai/inference + api_key: ${{ GH_MODELS_TOKEN }} + model: ${{ GH_MODELS_MODEL }} + + - name: copilot-cli + provider: copilot-cli + model: ${{ GH_MODELS_MODEL }} + grader_target: default + log_format: json + + - name: copilot-sdk + provider: copilot-sdk + model: ${{ GH_MODELS_MODEL }} + grader_target: default + log_format: json - name: pi-cli provider: pi-cli diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 972ac89d..99f17657 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -23,22 +23,15 @@ jobs: - name: Build run: bun run build - - name: Configure targets - run: | - mkdir -p .agentv - cat > .agentv/targets.yaml <<'TARGETS' - targets: - - name: default - provider: openai - base_url: https://models.github.ai/inference - api_key: ${{ GH_MODELS_TOKEN }} - model: ${{ GH_MODELS_MODEL }} - TARGETS + - name: Install GitHub Copilot CLI + run: curl -fsSL https://gh.io/copilot-install | bash - cat > .env < .env < Date: Wed, 1 Apr 2026 04:01:20 +0000 Subject: [PATCH 04/21] refactor: consolidate targets.yaml to root .agentv/ Merge examples/features/.agentv/ and examples/showcase/.agentv/ into the root .agentv/ directory. Adds all missing targets (azure, azure-llm, claude, claude-sdk, pi with tools, codex with cwd/log_dir). Per-eval .agentv folders are preserved for eval-specific overrides. 
Co-Authored-By: Claude Opus 4.6 --- .../features/.agentv => .agentv}/config.yaml | 0 .agentv/targets.yaml | 53 +++++++--- examples/features/.agentv/targets.yaml | 100 ------------------ examples/showcase/.agentv/config.yaml | 3 - examples/showcase/.agentv/targets.yaml | 23 ---- 5 files changed, 40 insertions(+), 139 deletions(-) rename {examples/features/.agentv => .agentv}/config.yaml (100%) delete mode 100644 examples/features/.agentv/targets.yaml delete mode 100644 examples/showcase/.agentv/config.yaml delete mode 100644 examples/showcase/.agentv/targets.yaml diff --git a/examples/features/.agentv/config.yaml b/.agentv/config.yaml similarity index 100% rename from examples/features/.agentv/config.yaml rename to .agentv/config.yaml diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 44c2f062..72ac9c7d 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -1,5 +1,6 @@ -# AgentV Self-Evaluation Targets -# Used to evaluate proposed changes against AGENTS.md design principles +# AgentV Evaluation Targets +# Consolidated from root, examples/features, and examples/showcase. +# Per-eval .agentv folders override these for specific eval cases. 
targets: - name: default @@ -20,25 +21,51 @@ targets: grader_target: default log_format: json - - name: pi-cli - provider: pi-cli + - name: claude + provider: claude + grader_target: default + log_format: json + + - name: claude-sdk + provider: claude-sdk + grader_target: default + log_format: json + + - name: pi + provider: pi-coding-agent subprovider: openrouter - model: openai/gpt-5.1-codex + model: openai/gpt-5.4 api_key: ${{ OPENROUTER_API_KEY }} - grader_target: gemini-flash + grader_target: default + tools: read,bash,edit,write + log_format: json - - name: pi-coding-agent - provider: pi-coding-agent + - name: pi-cli + provider: pi-cli subprovider: openrouter - model: z-ai/glm-4.7 + model: openai/gpt-5.1-codex api_key: ${{ OPENROUTER_API_KEY }} - system_prompt: "Answer directly based on the information provided." - grader_target: gemini-flash + grader_target: default - name: codex provider: codex - grader_target: gemini-llm - log_format: json # Optional: 'summary' (default) or 'json' for raw event logs + grader_target: default + cwd: ${{ CODEX_WORKSPACE_DIR }} + log_dir: ${{ CODEX_LOG_DIR }} + log_format: json + + - name: azure + provider: azure + endpoint: ${{ AZURE_OPENAI_ENDPOINT }} + api_key: ${{ AZURE_OPENAI_API_KEY }} + model: ${{ AZURE_DEPLOYMENT_NAME }} + + - name: azure-llm + provider: azure + endpoint: ${{ AZURE_OPENAI_ENDPOINT }} + api_key: ${{ AZURE_OPENAI_API_KEY }} + model: ${{ AZURE_DEPLOYMENT_NAME }} + version: ${{ AZURE_OPENAI_API_VERSION }} - name: gemini-llm provider: gemini diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml deleted file mode 100644 index 79b27a5e..00000000 --- a/examples/features/.agentv/targets.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# A list of all supported evaluation targets for the project. -# Each target defines a provider and its specific configuration. -# Actual values for paths/keys are stored in the local .env file. 
- -targets: - - name: default - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: openai - provider: openai - endpoint: ${{ OPENAI_ENDPOINT }} - api_key: ${{ OPENAI_API_KEY }} - model: ${{ OPENAI_MODEL }} - - - name: openrouter - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: ${{ OPENROUTER_MODEL }} - - - name: codex - provider: codex - grader_target: gemini-llm - # Uses the Codex CLI (defaults to `codex` on PATH) - # executable: ${{ CODEX_CLI_PATH }} # Optional: override executable path - # args: # Optional additional CLI arguments - # - --profile - # - ${{ CODEX_PROFILE }} - # - --model - # - ${{ CODEX_MODEL }} - # - --ask-for-approval - # - ${{ CODEX_APPROVAL_PRESET }} - cwd: ${{ CODEX_WORKSPACE_DIR }} # Where scratch workspaces are created - log_dir: ${{ CODEX_LOG_DIR }} # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex) - log_format: json # Optional: 'summary' (default) or 'json' for raw event logs - - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - - name: gemini-llm - provider: gemini - api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} - model: ${{ GEMINI_MODEL_NAME }} - - # Pi Coding Agent - autonomous coding agent from pi-mono (SDK) - - name: pi - provider: pi-coding-agent - subprovider: openrouter - model: openai/gpt-5.4 - api_key: ${{ OPENROUTER_API_KEY }} - grader_target: gemini-llm - tools: read,bash,edit,write # Default tools for coding tasks - log_format: json # 'summary' (default) or 'json' for raw event logs - # system_prompt: optional override (default instructs agent to include code in response) - - # Pi CLI - subprocess-based Pi agent - - name: pi-cli - provider: pi-cli - subprovider: openrouter - model: openai/gpt-5.4 - api_key: ${{ OPENROUTER_API_KEY 
}} - grader_target: gemini-llm - log_format: json - - # GitHub Copilot - CLI subprocess - - name: copilot - provider: copilot-cli - model: gpt-5-mini - grader_target: gemini-llm - log_format: json - - # GitHub Copilot - SDK - # Note: copilot-sdk discovers skills via grep (keyword search) rather than - # reading skill files directly. The skill-trigger evaluator only checks tool - # inputs for the skill name, so copilot-sdk may fail positive trigger cases. - - name: copilot-sdk - provider: copilot-sdk - model: gpt-5-mini - grader_target: gemini-llm - log_format: json - - # Claude - CLI subprocess - - name: claude - provider: claude - grader_target: gemini-llm - # model: claude-sonnet-4-20250514 # Optional: override model - log_format: json # 'summary' (default) or 'json' for raw event logs - # system_prompt: optional override (default instructs agent to include code in response) - - # Claude SDK - direct SDK invocation (requires @anthropic-ai/claude-agent-sdk) - - name: claude-sdk - provider: claude-sdk - grader_target: gemini-llm - log_format: json diff --git a/examples/showcase/.agentv/config.yaml b/examples/showcase/.agentv/config.yaml deleted file mode 100644 index a9ba9d60..00000000 --- a/examples/showcase/.agentv/config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -$schema: agentv-config-v2 - -# Example .agentv/config.yaml Configuration diff --git a/examples/showcase/.agentv/targets.yaml b/examples/showcase/.agentv/targets.yaml deleted file mode 100644 index 7413f67a..00000000 --- a/examples/showcase/.agentv/targets.yaml +++ /dev/null @@ -1,23 +0,0 @@ -targets: - - name: default - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: gemini-llm - provider: gemini - api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} - model: ${{ GEMINI_MODEL_NAME }} - - - name: codex - provider: codex - cwd: ${{ CODEX_WORKSPACE_DIR }} - log_dir: ${{ CODEX_LOG_DIR }} - log_format: json - grader_target: 
default - - - name: mock_agent - provider: cli - command: bun run ../tool-evaluation-plugins/mock-tool-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} - cwd: ${{ TOOL_EVAL_PLUGINS_DIR }} \ No newline at end of file From 48115ddd2a17f4e531bca0db0f4b4b36d0281497 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 04:05:11 +0000 Subject: [PATCH 05/21] fix(ci): use bunx agentv instead of dist path Since bun install links the local workspace package, bunx agentv resolves to the source without needing a global npm install. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 99f17657..9f16bdc7 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -47,7 +47,7 @@ jobs: run: | mkdir -p .agentv/ci-results - bun apps/cli/dist/cli.js eval run "${{ steps.filter.outputs.pattern }}" \ + bunx agentv eval run "${{ steps.filter.outputs.pattern }}" \ --workers 1 \ --threshold ${{ steps.filter.outputs.threshold }} \ -o .agentv/ci-results/junit.xml \ From 569dc00879fe324081d8e4a71579de26b26c0988 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 04:07:34 +0000 Subject: [PATCH 06/21] feat(ci): use COPILOT_MODEL env var for copilot targets Copilot CLI accepts --model to set the AI model. Use a separate COPILOT_MODEL env var (default: gpt-5-mini) for copilot-cli and copilot-sdk targets instead of reusing GH_MODELS_MODEL. 
Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 4 ++-- .github/workflows/evals.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 72ac9c7d..9aa1f5c4 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -11,13 +11,13 @@ targets: - name: copilot-cli provider: copilot-cli - model: ${{ GH_MODELS_MODEL }} + model: ${{ COPILOT_MODEL }} grader_target: default log_format: json - name: copilot-sdk provider: copilot-sdk - model: ${{ GH_MODELS_MODEL }} + model: ${{ COPILOT_MODEL }} grader_target: default log_format: json diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 9f16bdc7..e7c63be6 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -31,6 +31,7 @@ jobs: cat > .env < Date: Wed, 1 Apr 2026 04:24:50 +0000 Subject: [PATCH 07/21] refactor: use OPENROUTER_MODEL env var for pi targets Replace hardcoded model names in pi and pi-cli targets with ${{ OPENROUTER_MODEL }} env var. Default: openai/gpt-5.1-codex. 
Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 9aa1f5c4..3064109e 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -34,7 +34,7 @@ targets: - name: pi provider: pi-coding-agent subprovider: openrouter - model: openai/gpt-5.4 + model: ${{ OPENROUTER_MODEL }} api_key: ${{ OPENROUTER_API_KEY }} grader_target: default tools: read,bash,edit,write @@ -43,7 +43,7 @@ targets: - name: pi-cli provider: pi-cli subprovider: openrouter - model: openai/gpt-5.1-codex + model: ${{ OPENROUTER_MODEL }} api_key: ${{ OPENROUTER_API_KEY }} grader_target: default From 289ce6e6e18eeb289e7ca6b658b723e0130ec9b5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 05:28:41 +0000 Subject: [PATCH 08/21] refactor: remove duplicate azure target, keep azure-llm Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 3064109e..9335640a 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -54,12 +54,6 @@ targets: log_dir: ${{ CODEX_LOG_DIR }} log_format: json - - name: azure - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - name: azure-llm provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} From c59bdcb51ed08976d7bd19be140d12b6ea888de8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 05:30:07 +0000 Subject: [PATCH 09/21] fix(ci): use gpt-5-mini without openai/ prefix for GitHub Models GitHub Models expects bare model names (gpt-5-mini), not the openai/gpt-5-mini format used by OpenRouter. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index e7c63be6..15beb99f 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -30,7 +30,7 @@ jobs: run: | cat > .env < Date: Wed, 1 Apr 2026 05:38:28 +0000 Subject: [PATCH 10/21] fix(ci): use GITHUB_TOKEN directly for GitHub Models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GH_MODELS_TOKEN secret appears invalid. Use GITHUB_TOKEN directly to diagnose — it has GitHub Models access by default. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 15beb99f..80c278b3 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -29,7 +29,7 @@ jobs: - name: Configure credentials run: | cat > .env < Date: Wed, 1 Apr 2026 05:41:07 +0000 Subject: [PATCH 11/21] fix(ci): restore GH_MODELS_TOKEN with GITHUB_TOKEN fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GITHUB_TOKEN alone doesn't have GitHub Models access. Restore the original fallback chain — users need to set GH_MODELS_TOKEN secret with a PAT that has GitHub Models permissions. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 80c278b3..15beb99f 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -29,7 +29,7 @@ jobs: - name: Configure credentials run: | cat > .env < Date: Wed, 1 Apr 2026 06:01:07 +0000 Subject: [PATCH 12/21] fix(core): use chat completions API for non-OpenAI endpoints The Vercel AI SDK v3 defaults to the OpenAI Responses API (/responses), which isn't supported by third-party OpenAI-compatible endpoints like GitHub Models. Use openai.chat() instead of openai() when a custom base_url is configured to force /chat/completions. Also fix base_url to include /v1 suffix for GitHub Models. Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 2 +- packages/core/src/evaluation/providers/ai-sdk.ts | 5 ++++- packages/core/test/evaluation/providers/targets.test.ts | 6 +++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 9335640a..fe225127 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -5,7 +5,7 @@ targets: - name: default provider: openai - base_url: https://models.github.ai/inference + base_url: https://models.github.ai/inference/v1 api_key: ${{ GH_MODELS_TOKEN }} model: ${{ GH_MODELS_MODEL }} diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index 2bc90593..d166f84d 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -53,7 +53,10 @@ export class OpenAIProvider implements Provider { apiKey: config.apiKey, baseURL: config.baseURL, }); - this.model = openai(config.model); + // Non-OpenAI endpoints (e.g. GitHub Models) don't support the Responses + // API (/responses). Use .chat() to force /chat/completions. 
+ const isCustomEndpoint = config.baseURL && !config.baseURL.includes('api.openai.com'); + this.model = isCustomEndpoint ? openai.chat(config.model) : openai(config.model); } async invoke(request: ProviderRequest): Promise { diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index 358c33ba..187b75f1 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -22,7 +22,11 @@ const generateTextMock = mock(async () => ({ const createAzureMock = mock((options: unknown) => ({ chat: () => ({ provider: 'azure', options }), })); -const createOpenAIMock = mock((options: unknown) => () => ({ provider: 'openai', options })); +const createOpenAIMock = mock((options: unknown) => { + const defaultFn = () => ({ provider: 'openai', options }); + defaultFn.chat = () => ({ provider: 'openai', options, api: 'chat' }); + return defaultFn; +}); const createOpenRouterMock = mock((options: unknown) => () => ({ provider: 'openrouter', options, From a9fa1ed494e423fffbde090d68f00c3cfe2a5bb0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 06:07:30 +0000 Subject: [PATCH 13/21] fix(ci): add models:read permission for GitHub Models API The GITHUB_TOKEN needs explicit models:read permission to access the GitHub Models inference API. Without it, all requests return 404. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 15beb99f..ccee6728 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -16,6 +16,10 @@ jobs: evals: name: Run AgentV Evals runs-on: ubuntu-latest + permissions: + contents: read + checks: write + models: read steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup-bun From 967dcc59ae1b01db6375f21372f42a5e6ddfe063 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 06:10:23 +0000 Subject: [PATCH 14/21] fix(ci): prefer COPILOT_PAT for GitHub Models token COPILOT_PAT (fine-grained PAT with Copilot permission) also has GitHub Models access. Use it as the primary token, falling back to GH_MODELS_TOKEN then GITHUB_TOKEN. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index ccee6728..a861c9f0 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -33,7 +33,7 @@ jobs: - name: Configure credentials run: | cat > .env < Date: Wed, 1 Apr 2026 06:13:54 +0000 Subject: [PATCH 15/21] fix(ci): use bun apps/cli/dist/cli.js instead of bunx bunx agentv downloads the published npm version, ignoring the locally built source. Use the dist path directly to run from the workspace build which includes the .chat() fix. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index a861c9f0..e8c8c757 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -52,7 +52,7 @@ jobs: run: | mkdir -p .agentv/ci-results - bunx agentv eval run "${{ steps.filter.outputs.pattern }}" \ + bun apps/cli/dist/cli.js eval run "${{ steps.filter.outputs.pattern }}" \ --workers 1 \ --threshold ${{ steps.filter.outputs.threshold }} \ -o .agentv/ci-results/junit.xml \ From 1d6af4452deed9c4b9f2ff1c08be2aa29f778d75 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 06:22:58 +0000 Subject: [PATCH 16/21] feat(ci): expand eval patterns to include examples/features evals Add examples/features/**/*.EVAL.yaml alongside evals/**/eval.yaml so the multi-provider-skill-trigger eval runs in CI automatically. Pattern priority: workflow_dispatch input > vars.EVAL_PATTERNS repo variable > hardcoded default. Patterns are passed unquoted so the shell splits them into separate positional args for the CLI. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index e8c8c757..50a8368a 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -4,9 +4,9 @@ on: workflow_dispatch: inputs: suite_filter: - description: "Glob pattern for eval files to run" + description: "Space-separated glob patterns for eval files to run" required: false - default: "evals/**/eval.yaml" + default: "evals/**/eval.yaml examples/features/**/*.EVAL.yaml" threshold: description: "Minimum score threshold (0-1)" required: false @@ -40,8 +40,10 @@ jobs: - name: Resolve filter and threshold id: filter + env: + DEFAULT_PATTERNS: "evals/**/eval.yaml examples/features/**/*.EVAL.yaml" run: | - echo "pattern=${{ github.event.inputs.suite_filter || 'evals/**/eval.yaml' }}" >> "$GITHUB_OUTPUT" + echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT" echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" - name: Run AgentV evals @@ -52,7 +54,8 @@ jobs: run: | mkdir -p .agentv/ci-results - bun apps/cli/dist/cli.js eval run "${{ steps.filter.outputs.pattern }}" \ + # shellcheck disable=SC2086 + bun apps/cli/dist/cli.js eval run ${{ steps.filter.outputs.patterns }} \ --workers 1 \ --threshold ${{ steps.filter.outputs.threshold }} \ -o .agentv/ci-results/junit.xml \ From 883f6512a015e5129e5113ad2e753f10ee0a80ab Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 06:37:24 +0000 Subject: [PATCH 17/21] refactor(targets): add dedicated grader target, run CI with copilot-cli Restructure targets.yaml: - Add explicit "grader" target (GH Models) for LLM-as-judge scoring - Keep "default" as alias so existing example evals still work - All agent targets now reference grader_target: grader - Organize targets into grader / agent / LLM sections Update 
CI workflow: - Default target changed to copilot-cli (agent with skill support) - Add configurable --target input (override via vars.EVAL_TARGET) Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 27 ++++++++++++++++++++------- .github/workflows/evals.yml | 8 +++++++- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index fe225127..5ef95a33 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -1,34 +1,46 @@ # AgentV Evaluation Targets # Consolidated from root, examples/features, and examples/showcase. # Per-eval .agentv folders override these for specific eval cases. +# +# "grader" is the LLM used for scoring; agent targets reference it via +# grader_target so eval execution and grading use separate models. targets: + # ── Grader (LLM-as-judge) ────────────────────────────────────────── + # "default" is an alias so example evals with `target: default` work. - name: default provider: openai base_url: https://models.github.ai/inference/v1 api_key: ${{ GH_MODELS_TOKEN }} model: ${{ GH_MODELS_MODEL }} + - name: grader + provider: openai + base_url: https://models.github.ai/inference/v1 + api_key: ${{ GH_MODELS_TOKEN }} + model: ${{ GH_MODELS_MODEL }} + + # ── Agent targets ────────────────────────────────────────────────── - name: copilot-cli provider: copilot-cli model: ${{ COPILOT_MODEL }} - grader_target: default + grader_target: grader log_format: json - name: copilot-sdk provider: copilot-sdk model: ${{ COPILOT_MODEL }} - grader_target: default + grader_target: grader log_format: json - name: claude provider: claude - grader_target: default + grader_target: grader log_format: json - name: claude-sdk provider: claude-sdk - grader_target: default + grader_target: grader log_format: json - name: pi @@ -36,7 +48,7 @@ targets: subprovider: openrouter model: ${{ OPENROUTER_MODEL }} api_key: ${{ OPENROUTER_API_KEY }} - grader_target: default + grader_target: grader tools: read,bash,edit,write 
log_format: json @@ -45,15 +57,16 @@ targets: subprovider: openrouter model: ${{ OPENROUTER_MODEL }} api_key: ${{ OPENROUTER_API_KEY }} - grader_target: default + grader_target: grader - name: codex provider: codex - grader_target: default + grader_target: grader cwd: ${{ CODEX_WORKSPACE_DIR }} log_dir: ${{ CODEX_LOG_DIR }} log_format: json + # ── LLM targets (direct model access) ───────────────────────────── - name: azure-llm provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 50a8368a..4f16f285 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -7,6 +7,10 @@ on: description: "Space-separated glob patterns for eval files to run" required: false default: "evals/**/eval.yaml examples/features/**/*.EVAL.yaml" + target: + description: "Target name from .agentv/targets.yaml" + required: false + default: "copilot-cli" threshold: description: "Minimum score threshold (0-1)" required: false @@ -38,12 +42,13 @@ jobs: COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }} EOF - - name: Resolve filter and threshold + - name: Resolve inputs id: filter env: DEFAULT_PATTERNS: "evals/**/eval.yaml examples/features/**/*.EVAL.yaml" run: | echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT" + echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT" echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" - name: Run AgentV evals @@ -56,6 +61,7 @@ jobs: # shellcheck disable=SC2086 bun apps/cli/dist/cli.js eval run ${{ steps.filter.outputs.patterns }} \ + --target ${{ steps.filter.outputs.target }} \ --workers 1 \ --threshold ${{ steps.filter.outputs.threshold }} \ -o .agentv/ci-results/junit.xml \ From c8fa125069e08664d178d619a0f520f66d2bfc0b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 06:41:55 +0000 Subject: 
[PATCH 18/21] fix(ci): force root targets.yaml to override per-eval local targets Per-eval .agentv/targets.yaml files (e.g. agent-skills-evals uses echo provider) don't define copilot-cli. Use --targets to force the root targets.yaml so all evals use the same CI target configuration. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 4f16f285..8ae59d9e 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -61,6 +61,7 @@ jobs: # shellcheck disable=SC2086 bun apps/cli/dist/cli.js eval run ${{ steps.filter.outputs.patterns }} \ + --targets .agentv/targets.yaml \ --target ${{ steps.filter.outputs.target }} \ --workers 1 \ --threshold ${{ steps.filter.outputs.threshold }} \ From 8dc840f5f1eb5fedd95e682aec3e530da62ce4ba Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 06:47:39 +0000 Subject: [PATCH 19/21] fix(ci): narrow eval pattern to exclude csv-analyzer (no workspace template) csv-analyzer.EVAL.yaml expects a csv-analyzer skill but the workspace template only includes acme-deploy. Narrow the glob to specifically target multi-provider-skill-trigger.EVAL.yaml which has a proper workspace template with the required skill. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index 8ae59d9e..abb3d6e3 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -6,7 +6,7 @@ on: suite_filter: description: "Space-separated glob patterns for eval files to run" required: false - default: "evals/**/eval.yaml examples/features/**/*.EVAL.yaml" + default: "evals/**/eval.yaml examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" target: description: "Target name from .agentv/targets.yaml" required: false @@ -45,7 +45,7 @@ jobs: - name: Resolve inputs id: filter env: - DEFAULT_PATTERNS: "evals/**/eval.yaml examples/features/**/*.EVAL.yaml" + DEFAULT_PATTERNS: "evals/**/eval.yaml examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" run: | echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT" echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT" From ca8e7a1194d7376c91a6fce62e8afcd4cfeaec8f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 07:23:09 +0000 Subject: [PATCH 20/21] refactor(ci): use comma-separated eval patterns instead of space-separated Comma-separated is more standard for list values. Patterns are split into separate positional args via bash array expansion. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/evals.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index abb3d6e3..dbf1de8f 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -4,9 +4,9 @@ on: workflow_dispatch: inputs: suite_filter: - description: "Space-separated glob patterns for eval files to run" + description: "Comma-separated glob patterns for eval files to run" required: false - default: "evals/**/eval.yaml examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" + default: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" target: description: "Target name from .agentv/targets.yaml" required: false @@ -45,7 +45,7 @@ jobs: - name: Resolve inputs id: filter env: - DEFAULT_PATTERNS: "evals/**/eval.yaml examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" + DEFAULT_PATTERNS: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" run: | echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT" echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT" @@ -59,8 +59,9 @@ jobs: run: | mkdir -p .agentv/ci-results - # shellcheck disable=SC2086 - bun apps/cli/dist/cli.js eval run ${{ steps.filter.outputs.patterns }} \ + # Split comma-separated patterns into positional args + IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}" + bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \ --targets .agentv/targets.yaml \ --target ${{ steps.filter.outputs.target }} \ --workers 1 \ From 3c29df3e6f46e9ff4bd99fc39ccd709cbe571db7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 1 Apr 2026 07:33:56 +0000 Subject: [PATCH 21/21] refactor(openai): default to Chat Completions, opt in to Responses 
API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invert the logic: use .chat() (Chat Completions) by default since it is universally supported by all OpenAI-compatible endpoints. Only use the Responses API for actual api.openai.com, which is the only provider that supports /responses. Verified: - GH Models: /responses → 404, /chat/completions → 200 - Local evals with grader target: 3/3 at 1.000 - All 351 tests pass Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/providers/ai-sdk.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index d166f84d..85278890 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -53,10 +53,12 @@ export class OpenAIProvider implements Provider { apiKey: config.apiKey, baseURL: config.baseURL, }); - // Non-OpenAI endpoints (e.g. GitHub Models) don't support the Responses - // API (/responses). Use .chat() to force /chat/completions. - const isCustomEndpoint = config.baseURL && !config.baseURL.includes('api.openai.com'); - this.model = isCustomEndpoint ? openai.chat(config.model) : openai(config.model); + // Default to Chat Completions API (/chat/completions) which is + // universally supported by all OpenAI-compatible endpoints. + // Only use the Responses API (/responses) for actual OpenAI, which + // is the only provider that supports it. + const isOpenAI = config.baseURL?.includes('api.openai.com') ?? false; + this.model = isOpenAI ? openai(config.model) : openai.chat(config.model); } async invoke(request: ProviderRequest): Promise {