From f24ebc40d40df8e47dfff78e170faeb48fe1881d Mon Sep 17 00:00:00 2001
From: Fabian Schindler
Date: Tue, 24 Mar 2026 16:36:12 +0100
Subject: [PATCH] fix(openai): capture token usage for streaming responses when available

This requires the user to pass `stream_options={"include_usage": True}`
---
 sentry_sdk/integrations/openai.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index a5556b8776..4693070fa6 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -163,17 +163,21 @@ def _calculate_token_usage(
 
     if hasattr(response, "usage"):
         input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"])
-        if hasattr(response.usage, "input_tokens_details"):
-            input_tokens_cached = _get_usage(
-                response.usage.input_tokens_details, ["cached_tokens"]
-            )
+        input_tokens_details = getattr(
+            response.usage, "input_tokens_details", None
+        ) or getattr(response.usage, "prompt_tokens_details", None)
+        if input_tokens_details is not None:
+            input_tokens_cached = _get_usage(input_tokens_details, ["cached_tokens"])
 
         output_tokens = _get_usage(
            response.usage, ["output_tokens", "completion_tokens"]
         )
-        if hasattr(response.usage, "output_tokens_details"):
+        output_tokens_details = getattr(
+            response.usage, "output_tokens_details", None
+        ) or getattr(response.usage, "completion_tokens_details", None)
+        if output_tokens_details is not None:
             output_tokens_reasoning = _get_usage(
-                response.usage.output_tokens_details, ["reasoning_tokens"]
+                output_tokens_details, ["reasoning_tokens"]
             )
 
         total_tokens = _get_usage(response.usage, ["total_tokens"])
@@ -610,10 +614,14 @@ def _set_streaming_completions_api_output_data(
 
     def new_iterator() -> "Iterator[ChatCompletionChunk]":
         nonlocal ttft
+        usage_chunk = None
         for x in old_iterator:
             span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model)
 
             with capture_internal_exceptions():
+                if hasattr(x, "usage") and x.usage is not None:
+                    usage_chunk = x
+
                 if hasattr(x, "choices"):
                     choice_index = 0
                     for choice in x.choices:
@@ -643,7 +651,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
                 )
             _calculate_token_usage(
                 messages,
-                response,
+                usage_chunk if usage_chunk is not None else response,
                 span,
                 all_responses,
                 integration.count_tokens,
@@ -654,10 +662,14 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
         nonlocal ttft
+        usage_chunk = None
         async for x in old_iterator:
             span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model)
 
             with capture_internal_exceptions():
+                if hasattr(x, "usage") and x.usage is not None:
+                    usage_chunk = x
+
                 if hasattr(x, "choices"):
                     choice_index = 0
                     for choice in x.choices:
@@ -687,7 +699,7 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
                 )
             _calculate_token_usage(
                 messages,
-                response,
+                usage_chunk if usage_chunk is not None else response,
                 span,
                 all_responses,
                 integration.count_tokens,