50 changes: 50 additions & 0 deletions docs/examples/evals-sdk/coherence_evaluator.py
from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import CoherenceEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Set up telemetry
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Define a task function
def user_task_function(case: Case) -> dict:
    agent = Agent(
        # IMPORTANT: trace_attributes with session IDs are required when using StrandsInMemorySessionMapper
        # to prevent spans from different test cases from being mixed together in the memory exporter
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        callback_handler=None,
    )
    agent_response = agent(case.input)
    finished_spans = memory_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)

    return {"output": str(agent_response), "trajectory": session}


# 2. Create test cases
test_cases = [
    Case[str, str](
        name="multi-step-reasoning",
        input="Explain how photosynthesis works and why it is important for life on Earth.",
        metadata={"category": "coherence"},
    ),
    Case[str, str](
        name="compare-contrast",
        input="Compare and contrast renewable and non-renewable energy sources.",
        metadata={"category": "coherence"},
    ),
]

# 3. Create evaluators
evaluators = [CoherenceEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
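Note: the three examples that follow reuse this telemetry setup and task function unchanged; only the test cases and the evaluator differ.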
42 changes: 42 additions & 0 deletions docs/examples/evals-sdk/correctness_evaluator.py
from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import CorrectnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Set up telemetry
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Define a task function
def user_task_function(case: Case) -> dict:
    agent = Agent(
        # IMPORTANT: trace_attributes with session IDs are required when using StrandsInMemorySessionMapper
        # to prevent spans from different test cases from being mixed together in the memory exporter
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        callback_handler=None,
    )
    agent_response = agent(case.input)
    finished_spans = memory_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)

    return {"output": str(agent_response), "trajectory": session}


# 2. Create test cases
test_cases = [
    Case[str, str](name="math-1", input="What is 25 * 4?", metadata={"category": "math"}),
    Case[str, str](name="math-2", input="Calculate the square root of 144", metadata={"category": "math"}),
]

# 3. Create evaluators
evaluators = [CorrectnessEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
53 changes: 53 additions & 0 deletions docs/examples/evals-sdk/correctness_with_assertions_evaluator.py
from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import CorrectnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Set up telemetry
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Define a task function
def user_task_function(case: Case) -> dict:
    agent = Agent(
        # IMPORTANT: trace_attributes with session IDs are required when using StrandsInMemorySessionMapper
        # to prevent spans from different test cases from being mixed together in the memory exporter
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        callback_handler=None,
    )
    agent_response = agent(case.input)
    finished_spans = memory_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)

    return {"output": str(agent_response), "trajectory": session}


# 2. Create test cases with expected_assertion
# When expected_assertion is provided, the evaluator uses assertion mode:
# it judges whether the agent's response is correct by comparing it to the expected assertion,
# using a binary CORRECT/INCORRECT rubric rather than the 3-level basic rubric.
test_cases = [
    Case[str, str](
        name="math-1",
        input="What is 25 * 4?",
        expected_assertion="The agent should return the correct answer of 100.",
    ),
    Case[str, str](
        name="math-2",
        input="Calculate the square root of 144",
        expected_assertion="The agent should return the correct answer of 12.",
    ),
]

# 3. Create evaluators
evaluators = [CorrectnessEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
53 changes: 53 additions & 0 deletions docs/examples/evals-sdk/goal_success_rate_evaluator.py
from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Set up telemetry
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Define a task function
def user_task_function(case: Case) -> dict:
    agent = Agent(
        # IMPORTANT: trace_attributes with session IDs are required when using StrandsInMemorySessionMapper
        # to prevent spans from different test cases from being mixed together in the memory exporter
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        callback_handler=None,
    )
    agent_response = agent(case.input)
    finished_spans = memory_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)

    return {"output": str(agent_response), "trajectory": session}


# 2. Create test cases with expected_assertion
# When expected_assertion is provided, the evaluator uses assertion mode:
# it judges whether the agent's behavior satisfies the specified success assertions
# rather than inferring goals from the conversation.
test_cases = [
    Case[str, str](
        name="math-1",
        input="What is 25 * 4?",
        expected_assertion="The agent should return the correct answer of 100.",
    ),
    Case[str, str](
        name="math-2",
        input="Calculate the square root of 144",
        expected_assertion="The agent should return the correct answer of 12.",
    ),
]

# 3. Create evaluators
evaluators = [GoalSuccessRateEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(user_task_function)
reports[0].run_display()
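
For reference, a minimal sketch of a combined run. Every class name and call below comes from the examples above; running several evaluators in one Experiment, and getting one report back per evaluator, are assumptions based on the list-valued evaluators argument and the reports[0] indexing used in those examples.

from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import (
    CoherenceEvaluator,
    CorrectnessEvaluator,
    GoalSuccessRateEvaluator,
)
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# Set up telemetry, as in the examples above
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

def user_task_function(case: Case) -> dict:
    # Same task function as the examples above: run the agent, then rebuild
    # its trajectory from the spans recorded for this session ID.
    agent = Agent(
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        callback_handler=None,
    )
    agent_response = agent(case.input)
    finished_spans = memory_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)
    return {"output": str(agent_response), "trajectory": session}

test_cases = [
    Case[str, str](
        name="math-1",
        input="What is 25 * 4?",
        expected_assertion="The agent should return the correct answer of 100.",
    ),
]

# ASSUMPTION: passing several evaluators to a single Experiment; the examples
# above each pass a one-element list.
evaluators = [CoherenceEvaluator(), CorrectnessEvaluator(), GoalSuccessRateEvaluator()]

experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)
reports = experiment.run_evaluations(user_task_function)

# ASSUMPTION: one report per evaluator, as the reports[0] indexing above suggests.
for report in reports:
    report.run_display()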