Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7188f5e
feat: add evals control plane operations
notgitika Mar 8, 2026
8baa7a2
feat: add functionality to run evaluation and online evals
notgitika Mar 12, 2026
2097cbf
fix: eval data plane code clean up
notgitika Mar 12, 2026
28885cc
tests: add test coverage for evals
notgitika Mar 12, 2026
f52ae0a
add support for running evals on agents created outside the cli
notgitika Mar 12, 2026
9aca0bd
feat: add eval discovery commands, status enrichment, and schema updates
notgitika Mar 12, 2026
c513c54
fix: add API limit of 10 on spanIds and TUI changes
notgitika Mar 12, 2026
023fc3d
feat: add evals in resourcegraph
notgitika Mar 12, 2026
b84e5db
feat: add eval TUI screens, online eval dashboard, and run eval wizard
notgitika Mar 17, 2026
2acfa74
feat: remove stop online-eval command, use remove + deploy instead
notgitika Mar 18, 2026
bcea755
chore: remove unused get-eval-run module, add ARN mode tests for paus…
notgitika Mar 18, 2026
1ed222e
docs: update AGENTS.md with eval primitives, clarify sampling rate an…
notgitika Mar 18, 2026
44fb07f
fix: bump aws-cdk-lib to 2.243.0 and remove description from UpdateOn…
notgitika Mar 18, 2026
dd73909
fix: skip requireProject for run eval in ARN mode
notgitika Mar 18, 2026
f825b2a
feat: add session discovery and selection step to run eval TUI
notgitika Mar 18, 2026
92f89a8
feat: improve command docs, rating scale clarity, ARN support messaging
notgitika Mar 18, 2026
a11fbd2
feat: add eval level guidance, sampling rate context, score interpret…
notgitika Mar 18, 2026
4818a0f
chore: rename eval command to evals throughout codebase
notgitika Mar 19, 2026
cf0ed98
fix: require --instructions and validate placeholders for add evaluat…
notgitika Mar 19, 2026
fc6dc7a
fix: restore MCP/A2A code lost during rebase conflict resolution, fix…
notgitika Mar 19, 2026
5f0efc9
Merge branch 'main' into feat/eval-support
notgitika Mar 19, 2026
8c85fa8
fix: agents.md
notgitika Mar 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,16 @@ Note: CDK L3 constructs are in a separate package `@aws/agentcore-cdk`.
## CLI Commands

- `create` - Create new AgentCore project
- `add` - Add resources (agent, memory, identity, target)
- `remove` - Remove resources (agent, memory, identity, target, all)
- `add` - Add resources (agent, memory, identity, evaluator, online-eval, target)
- `remove` - Remove resources (agent, memory, identity, evaluator, online-eval, target, all)
- `deploy` - Deploy infrastructure to AWS
- `status` - Check deployment status
- `dev` - Local development server (CodeZip: uvicorn with hot-reload; Container: Docker build + run with volume mount)
- `invoke` - Invoke agents (local or deployed)
- `run eval` - Run on-demand evaluation against agent sessions
- `eval history` - View past eval run results
- `pause online-eval` - Pause (disable) a deployed online eval config
- `resume online-eval` - Resume (enable) a paused online eval config
- `package` - Package agent artifacts without deploying (zip for CodeZip, container image build for Container)
- `validate` - Validate configuration files
- `update` - Check for CLI updates
Expand Down Expand Up @@ -60,6 +64,8 @@ Current primitives:
- `AgentPrimitive` — agent creation (template + BYO), removal, credential resolution
- `MemoryPrimitive` — memory creation with strategies, removal
- `CredentialPrimitive` — credential/identity creation, .env management, removal
- `EvaluatorPrimitive` — custom evaluator creation/removal with cross-reference validation
- `OnlineEvalConfigPrimitive` — online eval config creation/removal
- `GatewayPrimitive` — MCP gateway creation/removal
- `GatewayTargetPrimitive` — MCP tool creation/removal with code generation

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ exports[`Assets Directory Snapshots > CDK assets > cdk/cdk/package.json should m
},
"dependencies": {
"@aws/agentcore-cdk": "^0.1.0-alpha.1",
"aws-cdk-lib": "2.239.0",
"aws-cdk-lib": "2.243.0",
"constructs": "^10.0.0"
}
}
Expand All @@ -372,6 +372,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
agents: [],
memories: [],
credentials: [],
evaluators: [],
onlineEvalConfigs: [],
},
});
const template = Template.fromStack(stack);
Expand Down
2 changes: 1 addition & 1 deletion src/assets/cdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
},
"dependencies": {
"@aws/agentcore-cdk": "^0.1.0-alpha.1",
"aws-cdk-lib": "2.239.0",
"aws-cdk-lib": "2.243.0",
"constructs": "^10.0.0"
}
}
2 changes: 2 additions & 0 deletions src/assets/cdk/test/cdk.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ test('AgentCoreStack synthesizes with empty spec', () => {
agents: [],
memories: [],
credentials: [],
evaluators: [],
onlineEvalConfigs: [],
},
});
const template = Template.fromStack(stack);
Expand Down
309 changes: 308 additions & 1 deletion src/cli/aws/__tests__/agentcore-control.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import { getAgentRuntimeStatus } from '../agentcore-control.js';
import {
getAgentRuntimeStatus,
getEvaluator,
getOnlineEvaluationConfig,
listEvaluators,
updateOnlineEvalExecutionStatus,
} from '../agentcore-control.js';
import { beforeEach, describe, expect, it, vi } from 'vitest';

const { mockSend } = vi.hoisted(() => ({
Expand All @@ -12,6 +18,18 @@ vi.mock('@aws-sdk/client-bedrock-agentcore-control', () => ({
GetAgentRuntimeCommand: class {
constructor(public input: unknown) {}
},
GetEvaluatorCommand: class {
constructor(public input: unknown) {}
},
GetOnlineEvaluationConfigCommand: class {
constructor(public input: unknown) {}
},
ListEvaluatorsCommand: class {
constructor(public input: unknown) {}
},
UpdateOnlineEvaluationConfigCommand: class {
constructor(public input: unknown) {}
},
}));

vi.mock('../account', () => ({
Expand Down Expand Up @@ -56,3 +74,292 @@ describe('getAgentRuntimeStatus', () => {
);
});
});

/**
 * Tests for `getEvaluator`.
 *
 * Every case programs the hoisted `mockSend` mock with the value it should
 * resolve or reject with, then checks what `getEvaluator` returns, throws,
 * or passes along in the command it sends.
 */
describe('getEvaluator', () => {
  // Clear recorded calls so `mock.calls[0]` always belongs to the running case.
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('returns evaluator details', async () => {
    const sdkResponse = {
      evaluatorId: 'eval-123',
      evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-123',
      evaluatorName: 'my-evaluator',
      level: 'SESSION',
      status: 'ACTIVE',
      description: 'A test evaluator',
    };
    mockSend.mockResolvedValue(sdkResponse);

    const evaluator = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-123' });

    expect(evaluator.evaluatorId).toBe('eval-123');
    expect(evaluator.evaluatorName).toBe('my-evaluator');
    expect(evaluator.level).toBe('SESSION');
    expect(evaluator.status).toBe('ACTIVE');
    expect(evaluator.description).toBe('A test evaluator');
  });

  it('throws when no evaluatorId in response', async () => {
    // A response without an ID is expected to surface as a "not found" error.
    mockSend.mockResolvedValue({ evaluatorId: undefined });

    const lookup = getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-missing' });

    await expect(lookup).rejects.toThrow('No evaluator found for ID eval-missing');
  });

  it('passes correct evaluatorId in command', async () => {
    const sdkResponse = {
      evaluatorId: 'eval-abc',
      evaluatorName: 'test',
      level: 'TRACE',
      status: 'ACTIVE',
    };
    mockSend.mockResolvedValue(sdkResponse);

    await getEvaluator({ region: 'us-west-2', evaluatorId: 'eval-abc' });

    // First argument of the first `send` call is the command object.
    const [sentCommand] = mockSend.mock.calls[0]!;
    expect(sentCommand.input.evaluatorId).toBe('eval-abc');
  });

  it('defaults level to SESSION when undefined', async () => {
    const sdkResponse = {
      evaluatorId: 'eval-no-level',
      level: undefined,
      status: 'ACTIVE',
    };
    mockSend.mockResolvedValue(sdkResponse);

    const evaluator = await getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-no-level' });

    expect(evaluator.level).toBe('SESSION');
  });

  it('propagates SDK errors', async () => {
    mockSend.mockRejectedValue(new Error('AccessDenied'));

    const lookup = getEvaluator({ region: 'us-east-1', evaluatorId: 'eval-err' });

    await expect(lookup).rejects.toThrow('AccessDenied');
  });
});

/**
 * Tests for `updateOnlineEvalExecutionStatus`.
 *
 * Covers both execution statuses used here ('DISABLED' and 'ENABLED'), the
 * parameters placed on the outgoing command, the fallback to input values
 * when the response is empty, and error propagation.
 */
describe('updateOnlineEvalExecutionStatus', () => {
  // Clear recorded calls so `mock.calls[0]` always belongs to the running case.
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('sends DISABLED to pause and returns result', async () => {
    const sdkResponse = {
      onlineEvaluationConfigId: 'cfg-123',
      executionStatus: 'DISABLED',
      status: 'ACTIVE',
    };
    mockSend.mockResolvedValue(sdkResponse);

    const outcome = await updateOnlineEvalExecutionStatus({
      region: 'us-east-1',
      onlineEvaluationConfigId: 'cfg-123',
      executionStatus: 'DISABLED',
    });

    expect(outcome.configId).toBe('cfg-123');
    expect(outcome.executionStatus).toBe('DISABLED');
    expect(outcome.status).toBe('ACTIVE');
  });

  it('sends ENABLED to resume', async () => {
    const sdkResponse = {
      onlineEvaluationConfigId: 'cfg-456',
      executionStatus: 'ENABLED',
      status: 'ACTIVE',
    };
    mockSend.mockResolvedValue(sdkResponse);

    const outcome = await updateOnlineEvalExecutionStatus({
      region: 'us-west-2',
      onlineEvaluationConfigId: 'cfg-456',
      executionStatus: 'ENABLED',
    });

    expect(outcome.configId).toBe('cfg-456');
    expect(outcome.executionStatus).toBe('ENABLED');
  });

  it('passes correct params in command', async () => {
    const sdkResponse = {
      onlineEvaluationConfigId: 'cfg-789',
      executionStatus: 'DISABLED',
      status: 'ACTIVE',
    };
    mockSend.mockResolvedValue(sdkResponse);

    await updateOnlineEvalExecutionStatus({
      region: 'us-east-1',
      onlineEvaluationConfigId: 'cfg-789',
      executionStatus: 'DISABLED',
    });

    // First argument of the first `send` call is the command object.
    const [sentCommand] = mockSend.mock.calls[0]!;
    expect(sentCommand.input.onlineEvaluationConfigId).toBe('cfg-789');
    expect(sentCommand.input.executionStatus).toBe('DISABLED');
  });

  it('falls back to input values when response fields are undefined', async () => {
    // An empty response: the result is expected to echo the request and report UNKNOWN status.
    mockSend.mockResolvedValue({});

    const outcome = await updateOnlineEvalExecutionStatus({
      region: 'us-east-1',
      onlineEvaluationConfigId: 'cfg-fallback',
      executionStatus: 'ENABLED',
    });

    expect(outcome.configId).toBe('cfg-fallback');
    expect(outcome.executionStatus).toBe('ENABLED');
    expect(outcome.status).toBe('UNKNOWN');
  });

  it('propagates SDK errors', async () => {
    mockSend.mockRejectedValue(new Error('Throttling'));

    const pendingUpdate = updateOnlineEvalExecutionStatus({
      region: 'us-east-1',
      onlineEvaluationConfigId: 'cfg-err',
      executionStatus: 'DISABLED',
    });

    await expect(pendingUpdate).rejects.toThrow('Throttling');
  });
});

/**
 * Tests for `getOnlineEvaluationConfig`.
 *
 * Checks how SDK response fields appear on the returned object (including the
 * CloudWatch log group name and `failureReason`), the error thrown when the
 * response has no config ID, the ID placed on the outgoing command, and error
 * propagation.
 */
describe('getOnlineEvaluationConfig', () => {
  // Clear recorded calls so `mock.calls[0]` always belongs to the running case.
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('returns config details with output log group', async () => {
    const sdkResponse = {
      onlineEvaluationConfigId: 'oec-123',
      onlineEvaluationConfigArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:online-eval/oec-123',
      onlineEvaluationConfigName: 'my-online-eval',
      status: 'ACTIVE',
      executionStatus: 'ENABLED',
      description: 'Production eval',
      outputConfig: {
        cloudWatchConfig: { logGroupName: '/aws/bedrock-agentcore/evaluations/oec-123' },
      },
    };
    mockSend.mockResolvedValue(sdkResponse);

    const config = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-123' });

    expect(config.configId).toBe('oec-123');
    expect(config.configName).toBe('my-online-eval');
    expect(config.status).toBe('ACTIVE');
    expect(config.executionStatus).toBe('ENABLED');
    expect(config.description).toBe('Production eval');
    expect(config.outputLogGroupName).toBe('/aws/bedrock-agentcore/evaluations/oec-123');
  });

  it('throws when no configId in response', async () => {
    mockSend.mockResolvedValue({ onlineEvaluationConfigId: undefined });

    const lookup = getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-missing' });

    await expect(lookup).rejects.toThrow('No online evaluation config found for ID oec-missing');
  });

  it('returns failureReason when present', async () => {
    const sdkResponse = {
      onlineEvaluationConfigId: 'oec-fail',
      onlineEvaluationConfigName: 'broken-eval',
      status: 'CREATE_FAILED',
      executionStatus: 'DISABLED',
      failureReason: 'IAM role not found',
    };
    mockSend.mockResolvedValue(sdkResponse);

    const config = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-fail' });

    expect(config.status).toBe('CREATE_FAILED');
    expect(config.failureReason).toBe('IAM role not found');
  });

  it('handles missing outputConfig', async () => {
    // No `outputConfig` in the response: the log group name should simply be absent.
    const sdkResponse = {
      onlineEvaluationConfigId: 'oec-no-output',
      status: 'CREATING',
      executionStatus: 'DISABLED',
    };
    mockSend.mockResolvedValue(sdkResponse);

    const config = await getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-no-output' });

    expect(config.outputLogGroupName).toBeUndefined();
  });

  it('passes correct configId in command', async () => {
    const sdkResponse = {
      onlineEvaluationConfigId: 'oec-abc',
      status: 'ACTIVE',
      executionStatus: 'ENABLED',
    };
    mockSend.mockResolvedValue(sdkResponse);

    await getOnlineEvaluationConfig({ region: 'us-west-2', configId: 'oec-abc' });

    // First argument of the first `send` call is the command object.
    const [sentCommand] = mockSend.mock.calls[0]!;
    expect(sentCommand.input.onlineEvaluationConfigId).toBe('oec-abc');
  });

  it('propagates SDK errors', async () => {
    mockSend.mockRejectedValue(new Error('ResourceNotFoundException'));

    const lookup = getOnlineEvaluationConfig({ region: 'us-east-1', configId: 'oec-err' });

    await expect(lookup).rejects.toThrow('ResourceNotFoundException');
  });
});

/**
 * Tests for `listEvaluators`.
 *
 * Checks that evaluator summaries from the SDK response are returned, that a
 * missing `evaluators` field becomes an empty array, that paging parameters
 * are forwarded on the command and `nextToken` is returned, and that SDK
 * errors propagate.
 */
describe('listEvaluators', () => {
  // Clear recorded calls so `mock.calls[0]` always belongs to the running case.
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it('returns evaluator summaries', async () => {
    const builtinSummary = {
      evaluatorId: 'eval-1',
      evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-1',
      evaluatorName: 'Faithfulness',
      evaluatorType: 'Builtin',
      status: 'ACTIVE',
    };
    const customSummary = {
      evaluatorId: 'eval-2',
      evaluatorArn: 'arn:aws:bedrock-agentcore:us-east-1:123456:evaluator/eval-2',
      evaluatorName: 'my-custom',
      evaluatorType: 'Custom',
      status: 'ACTIVE',
      description: 'A custom evaluator',
    };
    mockSend.mockResolvedValue({ evaluators: [builtinSummary, customSummary] });

    const page = await listEvaluators({ region: 'us-east-1' });

    expect(page.evaluators).toHaveLength(2);
    const [first, second] = page.evaluators;
    expect(first!.evaluatorName).toBe('Faithfulness');
    expect(first!.evaluatorType).toBe('Builtin');
    expect(second!.evaluatorName).toBe('my-custom');
    expect(second!.description).toBe('A custom evaluator');
  });

  it('returns empty array when no evaluators', async () => {
    mockSend.mockResolvedValue({ evaluators: undefined });

    const page = await listEvaluators({ region: 'us-east-1' });

    expect(page.evaluators).toEqual([]);
  });

  it('passes maxResults and nextToken', async () => {
    mockSend.mockResolvedValue({ evaluators: [], nextToken: 'token-2' });

    const page = await listEvaluators({ region: 'us-east-1', maxResults: 5, nextToken: 'token-1' });

    // The request token goes out on the command; the response token comes back on the result.
    const [sentCommand] = mockSend.mock.calls[0]!;
    expect(sentCommand.input.maxResults).toBe(5);
    expect(sentCommand.input.nextToken).toBe('token-1');
    expect(page.nextToken).toBe('token-2');
  });

  it('propagates SDK errors', async () => {
    mockSend.mockRejectedValue(new Error('AccessDeniedException'));

    const listing = listEvaluators({ region: 'us-east-1' });

    await expect(listing).rejects.toThrow('AccessDeniedException');
  });
});
Loading
Loading