Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
25a8abd
feat(ci): add GitHub Actions workflow to run evals
christso Apr 1, 2026
bb57287
fix(ci): use GH_MODELS_TOKEN with default OpenAI target
christso Apr 1, 2026
45a7a91
feat(ci): add GH Models default target with copilot-cli/copilot-sdk
christso Apr 1, 2026
0e99e7d
refactor: consolidate targets.yaml to root .agentv/
christso Apr 1, 2026
48115dd
fix(ci): use bunx agentv instead of dist path
christso Apr 1, 2026
569dc00
feat(ci): use COPILOT_MODEL env var for copilot targets
christso Apr 1, 2026
86ef97c
refactor: use OPENROUTER_MODEL env var for pi targets
christso Apr 1, 2026
289ce6e
refactor: remove duplicate azure target, keep azure-llm
christso Apr 1, 2026
c59bdcb
fix(ci): use gpt-5-mini without openai/ prefix for GitHub Models
christso Apr 1, 2026
a807256
fix(ci): use GITHUB_TOKEN directly for GitHub Models
christso Apr 1, 2026
c07b9a7
fix(ci): restore GH_MODELS_TOKEN with GITHUB_TOKEN fallback
christso Apr 1, 2026
04c2914
fix(core): use chat completions API for non-OpenAI endpoints
christso Apr 1, 2026
a9fa1ed
fix(ci): add models:read permission for GitHub Models API
christso Apr 1, 2026
967dcc5
fix(ci): prefer COPILOT_PAT for GitHub Models token
christso Apr 1, 2026
909b138
fix(ci): use bun apps/cli/dist/cli.js instead of bunx
christso Apr 1, 2026
1d6af44
feat(ci): expand eval patterns to include examples/features evals
christso Apr 1, 2026
883f651
refactor(targets): add dedicated grader target, run CI with copilot-cli
christso Apr 1, 2026
c8fa125
fix(ci): force root targets.yaml to override per-eval local targets
christso Apr 1, 2026
8dc840f
fix(ci): narrow eval pattern to exclude csv-analyzer (no workspace te…
christso Apr 1, 2026
ca8e7a1
refactor(ci): use comma-separated eval patterns instead of space-separated
christso Apr 1, 2026
3c29df3
refactor(openai): default to Chat Completions, opt in to Responses API
christso Apr 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
78 changes: 61 additions & 17 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
@@ -1,34 +1,78 @@
# AgentV Self-Evaluation Targets
# Used to evaluate proposed changes against AGENTS.md design principles
# AgentV Evaluation Targets
# Consolidated from root, examples/features, and examples/showcase.
# Per-eval .agentv folders override these for specific eval cases.
#
# "grader" is the LLM used for scoring; agent targets reference it via
# grader_target so eval execution and grading use separate models.

targets:
# ── Grader (LLM-as-judge) ──────────────────────────────────────────
# "default" is an alias so example evals with `target: default` work.
- name: default
provider: openai
base_url: https://models.github.ai/inference/v1
api_key: ${{ GH_MODELS_TOKEN }}
model: ${{ GH_MODELS_MODEL }}

- name: grader
provider: openai
base_url: https://models.github.ai/inference/v1
api_key: ${{ GH_MODELS_TOKEN }}
model: ${{ GH_MODELS_MODEL }}

# ── Agent targets ──────────────────────────────────────────────────
- name: copilot-cli
provider: copilot-cli
model: ${{ COPILOT_MODEL }}
grader_target: grader
log_format: json

- name: copilot-sdk
provider: copilot-sdk
model: ${{ COPILOT_MODEL }}
grader_target: grader
log_format: json

- name: claude
provider: claude
grader_target: grader
log_format: json

- name: claude-sdk
provider: claude-sdk
grader_target: grader
log_format: json

- name: pi
provider: pi-coding-agent
subprovider: openrouter
model: z-ai/glm-4.7
model: ${{ OPENROUTER_MODEL }}
api_key: ${{ OPENROUTER_API_KEY }}
system_prompt: "Answer directly based on the information provided."
grader_target: gemini-flash
grader_target: grader
tools: read,bash,edit,write
log_format: json

- name: pi-cli
provider: pi-cli
subprovider: openrouter
model: openai/gpt-5.1-codex
api_key: ${{ OPENROUTER_API_KEY }}
grader_target: gemini-flash

- name: pi-coding-agent
provider: pi-coding-agent
subprovider: openrouter
model: z-ai/glm-4.7
model: ${{ OPENROUTER_MODEL }}
api_key: ${{ OPENROUTER_API_KEY }}
system_prompt: "Answer directly based on the information provided."
grader_target: gemini-flash
grader_target: grader

- name: codex
provider: codex
grader_target: gemini-llm
log_format: json # Optional: 'summary' (default) or 'json' for raw event logs
grader_target: grader
cwd: ${{ CODEX_WORKSPACE_DIR }}
log_dir: ${{ CODEX_LOG_DIR }}
log_format: json

# ── LLM targets (direct model access) ─────────────────────────────
- name: azure-llm
provider: azure
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
version: ${{ AZURE_OPENAI_API_VERSION }}

- name: gemini-llm
provider: gemini
Expand Down
100 changes: 100 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# CI workflow: run AgentV evals on demand (workflow_dispatch only).
# Reconstructed with proper YAML indentation — the extracted page dump had
# flattened all leading whitespace, which makes the workflow invalid YAML.
name: Run Evals

on:
  workflow_dispatch:
    inputs:
      suite_filter:
        description: "Comma-separated glob patterns for eval files to run"
        required: false
        default: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml"
      target:
        description: "Target name from .agentv/targets.yaml"
        required: false
        default: "copilot-cli"
      threshold:
        description: "Minimum score threshold (0-1)"
        required: false
        default: "0.8"

jobs:
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    permissions:
      contents: read
      checks: write
      # models: read is required for the GitHub Models inference API
      models: read
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/setup-bun

      - name: Build
        run: bun run build

      - name: Install GitHub Copilot CLI
        run: curl -fsSL https://gh.io/copilot-install | bash

      # Write model credentials to .env; prefer COPILOT_PAT, then
      # GH_MODELS_TOKEN, falling back to the workflow's GITHUB_TOKEN.
      - name: Configure credentials
        run: |
          cat > .env <<EOF
          GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
          GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
          COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
          EOF

      # Resolve effective patterns/target/threshold: dispatch input wins,
      # then repository variables, then hard-coded defaults.
      - name: Resolve inputs
        id: filter
        env:
          DEFAULT_PATTERNS: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml"
        run: |
          echo "patterns=${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" >> "$GITHUB_OUTPUT"
          echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT"
          echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT"

      - name: Run AgentV evals
        id: run-evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          mkdir -p .agentv/ci-results

          # Split comma-separated patterns into positional args
          IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}"
          bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
            --targets .agentv/targets.yaml \
            --target ${{ steps.filter.outputs.target }} \
            --workers 1 \
            --threshold ${{ steps.filter.outputs.threshold }} \
            -o .agentv/ci-results/junit.xml \
            --benchmark-json .agentv/ci-results/benchmark.json \
            --artifacts .agentv/ci-results/artifacts \
            --verbose \
            2>&1 | tee .agentv/ci-results/eval-output.log

          # tee exits 0, so capture the CLI's exit code via PIPESTATUS
          # instead of $? and gate the final step on it.
          echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT"

      - name: Publish JUnit test results
        if: always()
        uses: dorny/test-reporter@v1
        with:
          name: AgentV Eval Results
          path: .agentv/ci-results/junit.xml
          reporter: java-junit
          fail-on-error: false

      - name: Upload eval artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_id }}
          path: .agentv/ci-results/
          retention-days: 30

      # Fail the job explicitly from the captured exit code so the
      # JUnit/artifact steps above always get a chance to run first.
      - name: Fail if threshold not met
        if: always()
        run: |
          if [ "${{ steps.run-evals.outputs.exit_code }}" != "0" ]; then
            echo "::error::Eval score below threshold (${{ steps.filter.outputs.threshold }})"
            exit 1
          fi
100 changes: 0 additions & 100 deletions examples/features/.agentv/targets.yaml

This file was deleted.

3 changes: 0 additions & 3 deletions examples/showcase/.agentv/config.yaml

This file was deleted.

23 changes: 0 additions & 23 deletions examples/showcase/.agentv/targets.yaml

This file was deleted.

7 changes: 6 additions & 1 deletion packages/core/src/evaluation/providers/ai-sdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ export class OpenAIProvider implements Provider {
apiKey: config.apiKey,
baseURL: config.baseURL,
});
this.model = openai(config.model);
// Default to Chat Completions API (/chat/completions) which is
// universally supported by all OpenAI-compatible endpoints.
// Only use the Responses API (/responses) for actual OpenAI, which
// is the only provider that supports it.
const isOpenAI = config.baseURL.includes('api.openai.com');
this.model = isOpenAI ? openai(config.model) : openai.chat(config.model);
}

async invoke(request: ProviderRequest): Promise<ProviderResponse> {
Expand Down
6 changes: 5 additions & 1 deletion packages/core/test/evaluation/providers/targets.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ const generateTextMock = mock(async () => ({
const createAzureMock = mock((options: unknown) => ({
chat: () => ({ provider: 'azure', options }),
}));
const createOpenAIMock = mock((options: unknown) => () => ({ provider: 'openai', options }));
// Mock factory mirroring the AI SDK's createOpenAI: the returned value is
// itself callable (Responses API path) and also exposes a `.chat` method
// (Chat Completions path), so tests can distinguish which API was chosen.
const createOpenAIMock = mock((options: unknown) =>
  Object.assign(() => ({ provider: 'openai', options }), {
    chat: () => ({ provider: 'openai', options, api: 'chat' }),
  }),
);
const createOpenRouterMock = mock((options: unknown) => () => ({
provider: 'openrouter',
options,
Expand Down
Loading