From a728bd0f07447483699232ac2152487cab39ff4f Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 08:40:31 +0100
Subject: [PATCH 1/5] test(langchain): Add tool execution test

---
 tests/conftest.py                             |  59 +++++
 .../integrations/langchain/test_langchain.py  | 187 ++++++++++++++++
 .../openai_agents/test_openai_agents.py       | 209 +++++++-----------
 3 files changed, 326 insertions(+), 129 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 5dd62931f1..1aa5f04f6a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1099,6 +1099,65 @@ def nonstreaming_responses_model_response():
     )


+@pytest.fixture
+def responses_tool_call_model_responses():
+    def inner(
+        tool_name: str,
+        arguments: str,
+        response_model: str,
+        response_text: str,
+        response_ids: "Iterator[str]",
+        usages: "Iterator[openai.types.responses.ResponseUsage]",
+    ):
+        yield openai.types.responses.Response(
+            id=next(response_ids),
+            output=[
+                openai.types.responses.ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name=tool_name,
+                    type="function_call",
+                    arguments=arguments,
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model=response_model,
+            object="response",
+            usage=next(usages),
+        )
+
+        yield openai.types.responses.Response(
+            id=next(response_ids),
+            output=[
+                openai.types.responses.ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        openai.types.responses.ResponseOutputText(
+                            text=response_text,
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model=response_model,
+            object="response",
+            usage=next(usages),
+        )
+
+    return inner
+
+
 class MockServerRequestHandler(BaseHTTPRequestHandler):
     def do_GET(self):  # noqa: N802
         # Process an HTTP GET request and return a response.
diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index 99ee2b7851..86f8e6ad1a 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -54,6 +54,14 @@
     CompletionUsage,
 )

+from openai.types.responses import (
+    ResponseUsage,
+)
+from openai.types.responses.response_usage import (
+    InputTokensDetails,
+    OutputTokensDetails,
+)
+
 LANGCHAIN_VERSION = package_version("langchain")


@@ -209,6 +217,185 @@ def test_langchain_create_agent(
     assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})


+@pytest.mark.skipif(
+    LANGCHAIN_VERSION < (1,),
+    reason="LangChain 1.0+ required (ONE AGENT refactor)",
+)
+@pytest.mark.parametrize(
+    "send_default_pii, include_prompts",
+    [
+        (True, True),
+        (True, False),
+        (False, True),
+        (False, False),
+    ],
+)
+def test_tool_execution_span(
+    sentry_init,
+    capture_events,
+    send_default_pii,
+    include_prompts,
+    get_model_response,
+    responses_tool_call_model_responses,
+):
+    sentry_init(
+        integrations=[
+            LangchainIntegration(
+                include_prompts=include_prompts,
+            )
+        ],
+        traces_sample_rate=1.0,
+        send_default_pii=send_default_pii,
+    )
+    events = capture_events()
+
+    responses = responses_tool_call_model_responses(
+        tool_name="get_word_length",
+        arguments='{"word": "eudca"}',
+        response_model="gpt-4-0613",
+        response_text="The word eudca has 5 letters.",
+        response_ids=iter(["resp_1", "resp_2"]),
+        usages=iter(
+            [
+                ResponseUsage(
+                    input_tokens=142,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=50,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=192,
+                ),
+                ResponseUsage(
+                    input_tokens=89,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=28,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=117,
+                ),
+            ]
+        ),
+    )
+    tool_response = get_model_response(
+        next(responses),
+        serialize_pydantic=True,
+        request_headers={
+            "X-Stainless-Raw-Response": "True",
+        },
+    )
+    final_response = get_model_response(
+        next(responses),
+        serialize_pydantic=True,
+        request_headers={
+            "X-Stainless-Raw-Response": "True",
+        },
+    )
+
+    llm = ChatOpenAI(
+        model_name="gpt-4",
+        temperature=0,
+        openai_api_key="badkey",
+        use_responses_api=True,
+    )
+    agent = create_agent(
+        model=llm,
+        tools=[get_word_length],
+        name="word_length_agent",
+    )
+
+    with patch.object(
+        llm.client._client._client,
+        "send",
+        side_effect=[tool_response, final_response],
+    ) as _:
+        with start_transaction():
+            agent.invoke(
+                {
+                    "messages": [
+                        HumanMessage(content="How many letters in the word eudca"),
+                    ],
+                },
+            )
+
+    tx = events[0]
+    assert tx["type"] == "transaction"
+    assert tx["contexts"]["trace"]["origin"] == "manual"
+
+    chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")
+    tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")
+
+    assert len(chat_spans) == 2
+
+    assert chat_spans[0]["origin"] == "auto.ai.langchain"
+    assert chat_spans[1]["origin"] == "auto.ai.langchain"
+    assert tool_exec_span["origin"] == "auto.ai.langchain"
+
+    # We can't guarantee anything about the "shape" of the langchain execution graph
+    assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0
+
+    # Token usage is only available in newer versions of langchain (v0.2+)
+    # where usage_metadata is supported on AIMessageChunk
+    if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:
+        assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
+        assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
+        assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192
+
+    if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]:
+        assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
+        assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
+        assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117
+
+    if send_default_pii and include_prompts:
+        assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT]
+
+        assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
+
+        # Verify tool calls are recorded when PII is enabled
+        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), (
+            "Tool calls should be recorded when send_default_pii=True and include_prompts=True"
+        )
+        tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]
+        assert isinstance(tool_calls_data, (list, str))  # Could be serialized
+        if isinstance(tool_calls_data, str):
+            assert "get_word_length" in tool_calls_data
+        elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0:
+            # Check if tool calls contain expected function name
+            tool_call_str = str(tool_calls_data)
+            assert "get_word_length" in tool_call_str
+    else:
+        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {})
+        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})
+        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {})
+        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {})
+        assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {})
+        assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {})
+
+        # Verify tool calls are NOT recorded when PII is disabled
+        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get(
+            "data", {}
+        ), (
+            f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} "
+            f"and include_prompts={include_prompts}"
+        )
+        assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get(
+            "data", {}
+        ), (
+            f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} "
+            f"and include_prompts={include_prompts}"
+        )
+
+    # Verify that available tools are always recorded regardless of PII settings
+    for chat_span in chat_spans:
+        tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS]
+        assert "get_word_length" in tools_data
+
+
 @pytest.mark.parametrize(
     "send_default_pii, include_prompts",
     [
diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 1442a2001b..c428a822d4 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -1062,7 +1062,11 @@ async def test_max_turns_before_handoff_span(

 @pytest.mark.asyncio
 async def test_tool_execution_span(
-    sentry_init, capture_events, test_agent, get_model_response
+    sentry_init,
+    capture_events,
+    test_agent,
+    get_model_response,
+    responses_tool_call_model_responses,
 ):
     """
     Test tool execution span creation.
@@ -1078,75 +1082,45 @@ def simple_test_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model)

-    tool_response = get_model_response(
-        Response(
-            id="resp_tool_123",
-            output=[
-                ResponseFunctionToolCall(
-                    id="call_123",
-                    call_id="call_123",
-                    name="simple_test_tool",
-                    type="function_call",
-                    arguments='{"message": "hello"}',
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+    responses = responses_tool_call_model_responses(
+        tool_name="simple_test_tool",
+        arguments='{"message": "hello"}',
+        response_model="gpt-4",
+        response_text="Task completed using the tool",
+        response_ids=iter(["resp_tool_123", "resp_final_123"]),
+        usages=iter(
+            [
+                ResponseUsage(
+                    input_tokens=10,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=5,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=15,
                 ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                ResponseUsage(
+                    input_tokens=15,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=10,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=25,
                 ),
-                total_tokens=15,
-            ),
+            ]
         ),
+    )
+    tool_response = get_model_response(
+        next(responses),
         serialize_pydantic=True,
     )
-
     final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="Task completed using the tool",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=25,
-            ),
-        ),
+        next(responses),
         serialize_pydantic=True,
     )

@@ -2298,7 +2272,11 @@ def test_openai_agents_message_role_mapping(

 @pytest.mark.asyncio
 async def test_tool_execution_error_tracing(
-    sentry_init, capture_events, test_agent, get_model_response
+    sentry_init,
+    capture_events,
+    test_agent,
+    get_model_response,
+    responses_tool_call_model_responses,
 ):
     """
     Test that tool execution errors are properly tracked via error tracing patch.
@@ -2321,75 +2299,45 @@ def failing_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[failing_tool], model=model)

-    tool_response = get_model_response(
-        Response(
-            id="resp_1",
-            output=[
-                ResponseFunctionToolCall(
-                    id="call_123",
-                    call_id="call_123",
-                    name="failing_tool",
-                    type="function_call",
-                    arguments='{"message": "test"}',
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+    responses = responses_tool_call_model_responses(
+        tool_name="failing_tool",
+        arguments='{"message": "test"}',
+        response_model="gpt-4-0613",
+        response_text="An error occurred while running the tool",
+        response_ids=iter(["resp_1", "resp_2"]),
+        usages=iter(
+            [
+                ResponseUsage(
+                    input_tokens=10,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=5,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=15,
                 ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                ResponseUsage(
+                    input_tokens=15,
+                    input_tokens_details=InputTokensDetails(
+                        cached_tokens=0,
+                    ),
+                    output_tokens=10,
+                    output_tokens_details=OutputTokensDetails(
+                        reasoning_tokens=0,
+                    ),
+                    total_tokens=25,
                 ),
-                total_tokens=15,
-            ),
+            ]
         ),
+    )
+    tool_response = get_model_response(
+        next(responses),
         serialize_pydantic=True,
     )
-
     final_response = get_model_response(
-        Response(
-            id="resp_2",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="An error occurred while running the tool",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4-0613",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=25,
-            ),
-        ),
+        next(responses),
         serialize_pydantic=True,
     )

@@ -2420,7 +2368,10 @@ def failing_tool(message: str) -> str:
     # Find the execute_tool span
     execute_tool_span = None
     for span in spans:
-        if span.get("description", "").startswith("execute_tool failing_tool"):
+        description = span.get("description", "")
+        if description is not None and description.startswith(
+            "execute_tool failing_tool"
+        ):
             execute_tool_span = span
             break

From e6bfe4d21d1ce6b40fcc7a8496367d939d3f849f Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 11:26:18 +0100
Subject: [PATCH 2/5] remove redundant assertion

---
 tests/integrations/langchain/test_langchain.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index b7016f226f..11a0fe3042 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -336,9 +336,6 @@ def test_tool_execution_span(
     assert chat_spans[1]["origin"] == "auto.ai.langchain"
     assert tool_exec_span["origin"] == "auto.ai.langchain"

-    # We can't guarantee anything about the "shape" of the langchain execution graph
-    assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0
-
     # Token usage is only available in newer versions of langchain (v0.2+)
     # where usage_metadata is supported on AIMessageChunk
     if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:

From 1b4b2baf31e6462a985dd2c1375bb705ddb24083 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 16:39:45 +0100
Subject: [PATCH 3/5] do not gate token assertions

---
 tests/integrations/langchain/test_langchain.py | 16 ++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index 122c45b224..45b7d28c27 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -333,17 +333,13 @@ def test_tool_execution_span(
     assert chat_spans[1]["origin"] == "auto.ai.langchain"
     assert tool_exec_span["origin"] == "auto.ai.langchain"

-    # Token usage is only available in newer versions of langchain (v0.2+)
-    # where usage_metadata is supported on AIMessageChunk
-    if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]:
-        assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
-        assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
-        assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192
+    assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142
+    assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50
+    assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192

-    if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]:
-        assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
-        assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
-        assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117
+    assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89
+    assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28
+    assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117

     if send_default_pii and include_prompts:
         assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT]

From c7e871b030d4e76e71dc45652330240b01ca7796 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 18:19:42 +0100
Subject: [PATCH 4/5] assert that there is only one tool span

---
 tests/integrations/langchain/test_langchain.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index 45b7d28c27..ed5cda83a6 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -325,10 +325,12 @@ def test_tool_execution_span(
     assert tx["contexts"]["trace"]["origin"] == "manual"

     chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")
-    tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")
-
     assert len(chat_spans) == 2

+    tool_exec_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool")
+    assert len(tool_exec_spans) == 1
+    tool_exec_span = tool_exec_spans[0]
+
     assert chat_spans[0]["origin"] == "auto.ai.langchain"
     assert chat_spans[1]["origin"] == "auto.ai.langchain"
     assert tool_exec_span["origin"] == "auto.ai.langchain"

From 09673b74eec68453834faca52012df5039ebdce1 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Mon, 23 Mar 2026 18:23:45 +0100
Subject: [PATCH 5/5] expect string response tool calls

---
 tests/integrations/langchain/test_langchain.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py
index ed5cda83a6..5a7032d552 100644
--- a/tests/integrations/langchain/test_langchain.py
+++ b/tests/integrations/langchain/test_langchain.py
@@ -353,13 +353,8 @@ def test_tool_execution_span(
             "Tool calls should be recorded when send_default_pii=True and include_prompts=True"
         )
         tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]
-        assert isinstance(tool_calls_data, (list, str))  # Could be serialized
-        if isinstance(tool_calls_data, str):
-            assert "get_word_length" in tool_calls_data
-        elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0:
-            # Check if tool calls contain expected function name
-            tool_call_str = str(tool_calls_data)
-            assert "get_word_length" in tool_call_str
+        assert isinstance(tool_calls_data, str)
+        assert "get_word_length" in tool_calls_data
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {})
         assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {})