diff --git a/examples/features/.agentv/config.yaml b/.agentv/config.yaml similarity index 100% rename from examples/features/.agentv/config.yaml rename to .agentv/config.yaml diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index de3f4cf0b..5ef95a332 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -1,34 +1,78 @@ -# AgentV Self-Evaluation Targets -# Used to evaluate proposed changes against AGENTS.md design principles +# AgentV Evaluation Targets +# Consolidated from root, examples/features, and examples/showcase. +# Per-eval .agentv folders override these for specific eval cases. +# +# "grader" is the LLM used for scoring; agent targets reference it via +# grader_target so eval execution and grading use separate models. targets: + # ── Grader (LLM-as-judge) ────────────────────────────────────────── + # "default" is an alias so example evals with `target: default` work. - name: default + provider: openai + base_url: https://models.github.ai/inference/v1 + api_key: ${{ GH_MODELS_TOKEN }} + model: ${{ GH_MODELS_MODEL }} + + - name: grader + provider: openai + base_url: https://models.github.ai/inference/v1 + api_key: ${{ GH_MODELS_TOKEN }} + model: ${{ GH_MODELS_MODEL }} + + # ── Agent targets ────────────────────────────────────────────────── + - name: copilot-cli + provider: copilot-cli + model: ${{ COPILOT_MODEL }} + grader_target: grader + log_format: json + + - name: copilot-sdk + provider: copilot-sdk + model: ${{ COPILOT_MODEL }} + grader_target: grader + log_format: json + + - name: claude + provider: claude + grader_target: grader + log_format: json + + - name: claude-sdk + provider: claude-sdk + grader_target: grader + log_format: json + + - name: pi provider: pi-coding-agent subprovider: openrouter - model: z-ai/glm-4.7 + model: ${{ OPENROUTER_MODEL }} api_key: ${{ OPENROUTER_API_KEY }} - system_prompt: "Answer directly based on the information provided." - grader_target: gemini-flash + grader_target: grader + tools: read,bash,edit,write + log_format: json - name: pi-cli provider: pi-cli subprovider: openrouter - model: openai/gpt-5.1-codex - api_key: ${{ OPENROUTER_API_KEY }} - grader_target: gemini-flash - - - name: pi-coding-agent - provider: pi-coding-agent - subprovider: openrouter - model: z-ai/glm-4.7 + model: ${{ OPENROUTER_MODEL }} api_key: ${{ OPENROUTER_API_KEY }} - system_prompt: "Answer directly based on the information provided." - grader_target: gemini-flash + grader_target: grader - name: codex provider: codex - grader_target: gemini-llm - log_format: json # Optional: 'summary' (default) or 'json' for raw event logs + grader_target: grader + cwd: ${{ CODEX_WORKSPACE_DIR }} + log_dir: ${{ CODEX_LOG_DIR }} + log_format: json + + # ── LLM targets (direct model access) ───────────────────────────── + - name: azure-llm + provider: azure + endpoint: ${{ AZURE_OPENAI_ENDPOINT }} + api_key: ${{ AZURE_OPENAI_API_KEY }} + model: ${{ AZURE_DEPLOYMENT_NAME }} + version: ${{ AZURE_OPENAI_API_VERSION }} - name: gemini-llm provider: gemini diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml new file mode 100644 index 000000000..dbf1de8f3 --- /dev/null +++ b/.github/workflows/evals.yml @@ -0,0 +1,100 @@ +name: Run Evals + +on: + workflow_dispatch: + inputs: + suite_filter: + description: "Comma-separated glob patterns for eval files to run" + required: false + default: "evals/**/eval.yaml,examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml" + target: + description: "Target name from .agentv/targets.yaml" + required: false + default: "copilot-cli" + threshold: + description: "Minimum score threshold (0-1)" + required: false + default: "0.8" + +jobs: + evals: + name: Run AgentV Evals + runs-on: ubuntu-latest + permissions: + contents: read + checks: write + models: read + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-bun + + - name: Build + run: bun run build + + - name: Install GitHub Copilot CLI + run: curl -fsSL https://gh.io/copilot-install | bash + + - name: Configure credentials + run: | + cat > .env <> "$GITHUB_OUTPUT" + echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || 'copilot-cli' }}" >> "$GITHUB_OUTPUT" + echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" + + - name: Run AgentV evals + id: run-evals + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + mkdir -p .agentv/ci-results + + # Split comma-separated patterns into positional args + IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}" + bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \ + --targets .agentv/targets.yaml \ + --target ${{ steps.filter.outputs.target }} \ + --workers 1 \ + --threshold ${{ steps.filter.outputs.threshold }} \ + -o .agentv/ci-results/junit.xml \ + --benchmark-json .agentv/ci-results/benchmark.json \ + --artifacts .agentv/ci-results/artifacts \ + --verbose \ + 2>&1 | tee .agentv/ci-results/eval-output.log + + echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT" + + - name: Publish JUnit test results + if: always() + uses: dorny/test-reporter@v1 + with: + name: AgentV Eval Results + path: .agentv/ci-results/junit.xml + reporter: java-junit + fail-on-error: false + + - name: Upload eval artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-results-${{ github.run_id }} + path: .agentv/ci-results/ + retention-days: 30 + + - name: Fail if threshold not met + if: always() + run: | + if [ "${{ steps.run-evals.outputs.exit_code }}" != "0" ]; then + echo "::error::Eval score below threshold (${{ steps.filter.outputs.threshold }})" + exit 1 + fi diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml deleted file mode 100644 index 79b27a5e2..000000000 --- a/examples/features/.agentv/targets.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# A list of all supported evaluation targets for the project. -# Each target defines a provider and its specific configuration. -# Actual values for paths/keys are stored in the local .env file. - -targets: - - name: default - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: openai - provider: openai - endpoint: ${{ OPENAI_ENDPOINT }} - api_key: ${{ OPENAI_API_KEY }} - model: ${{ OPENAI_MODEL }} - - - name: openrouter - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: ${{ OPENROUTER_MODEL }} - - - name: codex - provider: codex - grader_target: gemini-llm - # Uses the Codex CLI (defaults to `codex` on PATH) - # executable: ${{ CODEX_CLI_PATH }} # Optional: override executable path - # args: # Optional additional CLI arguments - # - --profile - # - ${{ CODEX_PROFILE }} - # - --model - # - ${{ CODEX_MODEL }} - # - --ask-for-approval - # - ${{ CODEX_APPROVAL_PRESET }} - cwd: ${{ CODEX_WORKSPACE_DIR }} # Where scratch workspaces are created - log_dir: ${{ CODEX_LOG_DIR }} # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex) - log_format: json # Optional: 'summary' (default) or 'json' for raw event logs - - - name: azure-llm - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} - - - name: gemini-llm - provider: gemini - api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} - model: ${{ GEMINI_MODEL_NAME }} - - # Pi Coding Agent - autonomous coding agent from pi-mono (SDK) - - name: pi - provider: pi-coding-agent - subprovider: openrouter - model: openai/gpt-5.4 - api_key: ${{ OPENROUTER_API_KEY }} - grader_target: gemini-llm - tools: read,bash,edit,write # Default tools for coding tasks - log_format: json # 'summary' (default) or 'json' for raw event logs - # system_prompt: optional override (default instructs agent to include code in response) - - # Pi CLI - subprocess-based Pi agent - - name: pi-cli - provider: pi-cli - subprovider: openrouter - model: openai/gpt-5.4 - api_key: ${{ OPENROUTER_API_KEY }} - grader_target: gemini-llm - log_format: json - - # GitHub Copilot - CLI subprocess - - name: copilot - provider: copilot-cli - model: gpt-5-mini - grader_target: gemini-llm - log_format: json - - # GitHub Copilot - SDK - # Note: copilot-sdk discovers skills via grep (keyword search) rather than - # reading skill files directly. The skill-trigger evaluator only checks tool - # inputs for the skill name, so copilot-sdk may fail positive trigger cases. - - name: copilot-sdk - provider: copilot-sdk - model: gpt-5-mini - grader_target: gemini-llm - log_format: json - - # Claude - CLI subprocess - - name: claude - provider: claude - grader_target: gemini-llm - # model: claude-sonnet-4-20250514 # Optional: override model - log_format: json # 'summary' (default) or 'json' for raw event logs - # system_prompt: optional override (default instructs agent to include code in response) - - # Claude SDK - direct SDK invocation (requires @anthropic-ai/claude-agent-sdk) - - name: claude-sdk - provider: claude-sdk - grader_target: gemini-llm - log_format: json diff --git a/examples/showcase/.agentv/config.yaml b/examples/showcase/.agentv/config.yaml deleted file mode 100644 index a9ba9d600..000000000 --- a/examples/showcase/.agentv/config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -$schema: agentv-config-v2 - -# Example .agentv/config.yaml Configuration diff --git a/examples/showcase/.agentv/targets.yaml b/examples/showcase/.agentv/targets.yaml deleted file mode 100644 index 7413f67a2..000000000 --- a/examples/showcase/.agentv/targets.yaml +++ /dev/null @@ -1,23 +0,0 @@ -targets: - - name: default - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - - - name: gemini-llm - provider: gemini - api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} - model: ${{ GEMINI_MODEL_NAME }} - - - name: codex - provider: codex - cwd: ${{ CODEX_WORKSPACE_DIR }} - log_dir: ${{ CODEX_LOG_DIR }} - log_format: json - grader_target: default - - - name: mock_agent - provider: cli - command: bun run ../tool-evaluation-plugins/mock-tool-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} - cwd: ${{ TOOL_EVAL_PLUGINS_DIR }} \ No newline at end of file diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index 2bc905932..85278890b 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -53,7 +53,12 @@ export class OpenAIProvider implements Provider { apiKey: config.apiKey, baseURL: config.baseURL, }); - this.model = openai(config.model); + // Default to Chat Completions API (/chat/completions) which is + // universally supported by all OpenAI-compatible endpoints. + // Only use the Responses API (/responses) for actual OpenAI, which + // is the only provider that supports it. + const isOpenAI = config.baseURL.includes('api.openai.com'); + this.model = isOpenAI ? openai(config.model) : openai.chat(config.model); } async invoke(request: ProviderRequest): Promise { diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index 358c33ba5..187b75f18 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -22,7 +22,11 @@ const generateTextMock = mock(async () => ({ const createAzureMock = mock((options: unknown) => ({ chat: () => ({ provider: 'azure', options }), })); -const createOpenAIMock = mock((options: unknown) => () => ({ provider: 'openai', options })); +const createOpenAIMock = mock((options: unknown) => { + const defaultFn = () => ({ provider: 'openai', options }); + defaultFn.chat = () => ({ provider: 'openai', options, api: 'chat' }); + return defaultFn; +}); const createOpenRouterMock = mock((options: unknown) => () => ({ provider: 'openrouter', options,