From a728bd0f07447483699232ac2152487cab39ff4f Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 08:40:31 +0100
Subject: [PATCH 1/5] test(langchain): Add tool execution test

---
 tests/conftest.py                             |  59 +++++
 .../integrations/langchain/test_langchain.py  | 187 ++++++++++++++++
 .../openai_agents/test_openai_agents.py       | 209 +++++++-----------
 3 files changed, 326 insertions(+), 129 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 5dd62931f1..1aa5f04f6a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1099,6 +1099,65 @@ def nonstreaming_responses_model_response():
     )


+@pytest.fixture
+def responses_tool_call_model_responses():
+    def inner(
+        tool_name: str,
+        arguments: str,
+        response_model: str,
+        response_text: str,
+        response_ids: "Iterator[str]",
+        usages: "Iterator[openai.types.responses.ResponseUsage]",
+    ):
+        yield openai.types.responses.Response(
+            id=next(response_ids),
+            output=[
+                openai.types.responses.ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name=tool_name,
+                    type="function_call",
+                    arguments=arguments,
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model=response_model,
+            object="response",
+            usage=next(usages),
+        )
+
+        yield openai.types.responses.Response(
+            id=next(response_ids),
+            output=[
+                openai.types.responses.ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        openai.types.responses.ResponseOutputText(
+                            text=response_text,
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model=response_model,
+            object="response",
+            usage=next(usages),
+        )
+
+    return inner
+
+
 class MockServerRequestHandler(BaseHTTPRequestHandler):
     def do_GET(self):  # noqa: N802
         # Process an HTTP GET request and return a response.
diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index 99ee2b7851..86f8e6ad1a 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -54,6 +54,14 @@
     CompletionUsage,
 )

+from openai.types.responses import (
+    ResponseUsage,
+)
+from openai.types.responses.response_usage import (
+    InputTokensDetails,
+    OutputTokensDetails,
+)
+
 LANGCHAIN_VERSION = package_version("langchain")


@@ -209,6 +217,185 @@ def test_langchain_create_agent(
     assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})


+@pytest.mark.skipif(
+    LANGCHAIN_VERSION < (1,),
+    reason="LangChain 1.0+ required (ONE AGENT refactor)",
+)
+@pytest.mark.parametrize(
+    "send_default_pii, include_prompts",
+    [
+        (True, True),
+        (True, False),
+        (False, True),
+        (False, False),
+    ],
+)
+def test_tool_execution_span(
+    sentry_init,
+    capture_events,
+    send_default_pii,
+    include_prompts,
+    get_model_response,
+    responses_tool_call_model_responses,
+):
+    sentry_init(
+        integrations=[
+            LangchainIntegration(
+                include_prompts=include_prompts,
+            )
+        ],
+        traces_sample_rate=1.0,
+        send_default_pii=send_default_pii,
+    )
+    events = capture_events()
+
+    responses = responses_tool_call_model_responses(
+        tool_name="get_word_length",
+        arguments='{"word": "eudca"}',
+        response_model="gpt-4-0613",
+        response_text="The word eudca has 5 letters.",
+        response_ids=iter(["resp_1", "resp_2"]),
+        usages=iter(
+            [
+                ResponseUsage(
+                    input_tokens=142,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=50,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=192,
+                ),
+                ResponseUsage(
+                    input_tokens=89,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=28,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=117,
+                ),
+            ]
+        ),
+    )
+    tool_response = get_model_response(
+        next(responses),
+        serialize_pydantic=True,
+        request_headers={
+            "X-Stainless-Raw-Response": "True",
+        },
+    )
+    final_response = get_model_response(
+        next(responses),
+        serialize_pydantic=True,
+        request_headers={
+            "X-Stainless-Raw-Response": "True",
+        },
+    )
+
+    llm = ChatOpenAI(
+        model_name="gpt-4",
+        temperature=0,
+        openai_api_key="badkey",
+        use_responses_api=True,
+    )
+    agent = create_agent(
+        model=llm,
+        tools=[get_word_length],
+        name="word_length_agent",
+    )
+
+    with patch.object(
+        llm.client._client._client,
+        "send",
+        side_effect=[tool_response, final_response],
+    ) as _:
+        with start_transaction():
+            agent.invoke(
+                {
+                    "messages": [
+                        HumanMessage(content="How many letters in the word eudca"),
+                    ],
+                },
+            )
+
+    tx = events[0]
+    assert tx["type"] == "transaction"
+    assert tx["contexts"]["trace"]["origin"] == "manual"
+
+    chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")
+    tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")
+
+    assert len(chat_spans) == 2
+
+    assert chat_spans[0]["origin"] == "auto.ai.langchain"
+    assert chat_spans[1]["origin"] == "auto.ai.langchain"
+    assert tool_exec_span["origin"] == "auto.ai.langchain"
+
+    # We can't guarantee anything about the "shape" of the langchain execution graph
+    assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0
+
+    # Token usage is only available in newer versions of langchain (v0.2+)
+    # where usage_metadata is supported on AIMessageChunk
+    if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:
+        assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
+        assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
+        assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192
+
+    if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]:
+        assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
+        assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
+        assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117
+
+    if send_default_pii and include_prompts:
+        assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT]
+
+        assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
+
+        # Verify tool calls are recorded when PII is enabled
+        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), (
+            "Tool calls should be recorded when send_default_pii=True and include_prompts=True"
+        )
+        tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]
+        assert isinstance(tool_calls_data, (list, str))  # Could be serialized
+        if isinstance(tool_calls_data, str):
+            assert "get_word_length" in tool_calls_data
+        elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0:
+            # Check if tool calls contain expected function name
+            tool_call_str = str(tool_calls_data)
+            assert "get_word_length" in tool_call_str
+    else:
+        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {})
+        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})
+        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {})
+        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {})
+        assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {})
+        assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {})
+
+        # Verify tool calls are NOT recorded when PII is disabled
+        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get(
+            "data", {}
+        ), (
+            f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} "
+            f"and include_prompts={include_prompts}"
+        )
+        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get(
+            "data", {}
+        ), (
+            f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} "
+            f"and include_prompts={include_prompts}"
+        )
+
+    # Verify that available tools are always recorded regardless of PII settings
+    for chat_span in chat_spans:
+        tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS]
+        assert "get_word_length" in tools_data
+
+
 @pytest.mark.parametrize(
     "send_default_pii, include_prompts",
     [
diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 1442a2001b..c428a822d4 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -1062,7 +1062,11 @@ async def test_max_turns_before_handoff_span(

 @pytest.mark.asyncio
 async def test_tool_execution_span(
-    sentry_init, capture_events, test_agent, get_model_response
+    sentry_init,
+    capture_events,
+    test_agent,
+    get_model_response,
+    responses_tool_call_model_responses,
 ):
     """
     Test tool execution span creation.
@@ -1078,75 +1082,45 @@ def simple_test_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model)

-    tool_response = get_model_response(
-        Response(
-            id="resp_tool_123",
-            output=[
-                ResponseFunctionToolCall(
-                    id="call_123",
-                    call_id="call_123",
-                    name="simple_test_tool",
-                    type="function_call",
-                    arguments='{"message": "hello"}',
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+    responses = responses_tool_call_model_responses(
+        tool_name="simple_test_tool",
+        arguments='{"message": "hello"}',
+        response_model="gpt-4",
+        response_text="Task completed using the tool",
+        response_ids=iter(["resp_tool_123", "resp_final_123"]),
+        usages=iter(
+            [
+                ResponseUsage(
+                    input_tokens=10,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=5,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=15,
                 ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                ResponseUsage(
+                    input_tokens=15,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=10,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=25,
                 ),
-                total_tokens=15,
-            ),
+            ]
         ),
+    )
+    tool_response = get_model_response(
+        next(responses),
         serialize_pydantic=True,
     )
-
     final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="Task completed using the tool",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=25,
-            ),
-        ),
+        next(responses),
         serialize_pydantic=True,
     )

@@ -2298,7 +2272,11 @@ def test_openai_agents_message_role_mapping(

 @pytest.mark.asyncio
 async def test_tool_execution_error_tracing(
-    sentry_init, capture_events, test_agent, get_model_response
+    sentry_init,
+    capture_events,
+    test_agent,
+    get_model_response,
+    responses_tool_call_model_responses,
 ):
     """
     Test that tool execution errors are properly tracked via error tracing patch.
@@ -2321,75 +2299,45 @@ def failing_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[failing_tool], model=model)

-    tool_response = get_model_response(
-        Response(
-            id="resp_1",
-            output=[
-                ResponseFunctionToolCall(
-                    id="call_123",
-                    call_id="call_123",
-                    name="failing_tool",
-                    type="function_call",
-                    arguments='{"message": "test"}',
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+    responses = responses_tool_call_model_responses(
+        tool_name="failing_tool",
+        arguments='{"message": "test"}',
+        response_model="gpt-4-0613",
+        response_text="An error occurred while running the tool",
+        response_ids=iter(["resp_1", "resp_2"]),
+        usages=iter(
+            [
+                ResponseUsage(
+                    input_tokens=10,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=5,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=15,
                 ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                ResponseUsage(
+                    input_tokens=15,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=10,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=25,
                 ),
-                total_tokens=15,
-            ),
+            ]
         ),
+    )
+    tool_response = get_model_response(
+        next(responses),
         serialize_pydantic=True,
     )
-
     final_response = get_model_response(
-        Response(
-            id="resp_2",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="An error occurred while running the tool",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4-0613",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=25,
-            ),
-        ),
+        next(responses),
         serialize_pydantic=True,
     )

@@ -2420,7 +2368,10 @@ def failing_tool(message: str) -> str:
     # Find the execute_tool span
     execute_tool_span = None
     for span in spans:
-        if span.get("description", "").startswith("execute_tool failing_tool"):
+        description = span.get("description", "")
+        if description is not None and description.startswith(
+            "execute_tool failing_tool"
+        ):
             execute_tool_span = span
             break

From e6bfe4d21d1ce6b40fcc7a8496367d939d3f849f Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 11:26:18 +0100
Subject: [PATCH 2/5] remove redundant assertion

---
 tests/integrations/langchain/test_langchain.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index b7016f226f..11a0fe3042 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -336,9 +336,6 @@ def test_tool_execution_span(
     assert chat_spans[1]["origin"] == "auto.ai.langchain"
     assert tool_exec_span["origin"] == "auto.ai.langchain"

-    # We can't guarantee anything about the "shape" of the langchain execution graph
-    assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0
-
     # Token usage is only available in newer versions of langchain (v0.2+)
     # where usage_metadata is supported on AIMessageChunk
     if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:

From 1b4b2baf31e6462a985dd2c1375bb705ddb24083 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 16:39:45 +0100
Subject: [PATCH 3/5] do not gate token assertions

---
 tests/integrations/langchain/test_langchain.py | 16 ++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index 122c45b224..45b7d28c27 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -333,17 +333,13 @@ def test_tool_execution_span(
     assert chat_spans[1]["origin"] == "auto.ai.langchain"
     assert tool_exec_span["origin"] == "auto.ai.langchain"

-    # Token usage is only available in newer versions of langchain (v0.2+)
-    # where usage_metadata is supported on AIMessageChunk
-    if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:
-        assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
-        assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
-        assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192
+    assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
+    assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
+    assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192

-    if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]:
-        assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
-        assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
-        assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117
+    assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
+    assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
+    assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117

     if send_default_pii and include_prompts:
         assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT]

From c7e871b030d4e76e71dc45652330240b01ca7796 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 18:19:42 +0100
Subject: [PATCH 4/5] assert that there is only one tool span

---
 tests/integrations/langchain/test_langchain.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index 45b7d28c27..ed5cda83a6 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -325,10 +325,12 @@ def test_tool_execution_span(
     assert tx["contexts"]["trace"]["origin"] == "manual"

     chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")
-    tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")
-
     assert len(chat_spans) == 2

+    tool_exec_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")
+    assert len(tool_exec_spans) == 1
+    tool_exec_span = tool_exec_spans[0]
+
     assert chat_spans[0]["origin"] == "auto.ai.langchain"
     assert chat_spans[1]["origin"] == "auto.ai.langchain"
     assert tool_exec_span["origin"] == "auto.ai.langchain"

From 09673b74eec68453834faca52012df5039ebdce1 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 18:23:45 +0100
Subject: [PATCH 5/5] expect string response tool calls

---
 tests/integrations/langchain/test_langchain.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index ed5cda83a6..5a7032d552 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -353,13 +353,8 @@ def test_tool_execution_span(
             "Tool calls should be recorded when send_default_pii=True and include_prompts=True"
         )
         tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]
-        assert isinstance(tool_calls_data, (list, str))  # Could be serialized
-        if isinstance(tool_calls_data, str):
-            assert "get_word_length" in tool_calls_data
-        elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0:
-            # Check if tool calls contain expected function name
-            tool_call_str = str(tool_calls_data)
-            assert "get_word_length" in tool_call_str
+        assert isinstance(tool_calls_data, str)
+        assert "get_word_length" in tool_calls_data
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {})
         assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})