Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions sentry_sdk/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,12 @@ class SPANDATA:
Example: ["The weather in Paris is rainy and overcast, with temperatures around 57°F", "The weather in London is sunny and warm, with temperatures around 65°F"]
"""

GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN = "gen_ai.response.time_to_first_token"
"""
The time it took to receive the first token from the model.
Example: 0.1
"""

GEN_AI_RESPONSE_TOOL_CALLS = "gen_ai.response.tool_calls"
"""
The tool calls in the model's response.
Expand Down
28 changes: 26 additions & 2 deletions sentry_sdk/integrations/openai.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
from functools import wraps
import time

import sentry_sdk
from sentry_sdk import consts
Expand Down Expand Up @@ -249,6 +250,7 @@ def _set_output_data(
response: "Any",
kwargs: "dict[str, Any]",
integration: "OpenAIIntegration",
start_time: "Optional[float]" = None,
finish_span: bool = True,
) -> None:
if hasattr(response, "model"):
Expand All @@ -263,6 +265,8 @@ def _set_output_data(
if messages is not None and isinstance(messages, str):
messages = [messages]

ttft: "Optional[float]" = None

if hasattr(response, "choices"):
if should_send_default_pii() and integration.include_prompts:
response_text = [
Expand Down Expand Up @@ -320,6 +324,7 @@ def _set_output_data(
old_iterator = response._iterator

def new_iterator() -> "Iterator[ChatCompletionChunk]":
nonlocal ttft
count_tokens_manually = True
for x in old_iterator:
with capture_internal_exceptions():
Expand All @@ -330,6 +335,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
if hasattr(choice, "delta") and hasattr(
choice.delta, "content"
):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
content = choice.delta.content
if len(data_buf) <= choice_index:
data_buf.append([])
Expand All @@ -338,6 +345,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":

# OpenAI responses API
elif hasattr(x, "delta"):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
if len(data_buf) == 0:
data_buf.append([])
data_buf[0].append(x.delta or "")
Expand All @@ -356,6 +365,10 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
yield x

with capture_internal_exceptions():
if ttft is not None:
set_data_normalized(
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
if len(data_buf) > 0:
all_responses = ["".join(chunk) for chunk in data_buf]
if should_send_default_pii() and integration.include_prompts:
Expand All @@ -375,6 +388,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
span.__exit__(None, None, None)

async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
nonlocal ttft
count_tokens_manually = True
async for x in old_iterator:
with capture_internal_exceptions():
Expand All @@ -385,6 +399,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
if hasattr(choice, "delta") and hasattr(
choice.delta, "content"
):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
content = choice.delta.content
if len(data_buf) <= choice_index:
data_buf.append([])
Expand All @@ -393,6 +409,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":

# OpenAI responses API
elif hasattr(x, "delta"):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
if len(data_buf) == 0:
data_buf.append([])
data_buf[0].append(x.delta or "")
Expand All @@ -411,6 +429,10 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
yield x

with capture_internal_exceptions():
if ttft is not None:
set_data_normalized(
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
if len(data_buf) > 0:
all_responses = ["".join(chunk) for chunk in data_buf]
if should_send_default_pii() and integration.include_prompts:
Expand Down Expand Up @@ -465,9 +487,10 @@ def _new_chat_completion_common(f: "Any", *args: "Any", **kwargs: "Any") -> "Any

_set_input_data(span, kwargs, operation, integration)

start_time = time.perf_counter()
response = yield f, args, kwargs

_set_output_data(span, response, kwargs, integration, finish_span=True)
_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)

return response

Expand Down Expand Up @@ -645,9 +668,10 @@ def _new_responses_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "An

_set_input_data(span, kwargs, operation, integration)

start_time = time.perf_counter()
response = yield f, args, kwargs

_set_output_data(span, response, kwargs, integration, finish_span=True)
_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)

return response

Expand Down
15 changes: 14 additions & 1 deletion sentry_sdk/integrations/openai_agents/patches/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import copy
import sys
import time
from functools import wraps

from sentry_sdk.integrations import DidNotEnable
Expand Down Expand Up @@ -149,8 +149,21 @@ async def wrapped_stream_response(*args: "Any", **kwargs: "Any") -> "Any":
span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True)

streaming_response = None
ttft_recorded = False

async for event in original_stream_response(*args, **kwargs):
# Detect first content token (text delta event)
if not ttft_recorded and hasattr(event, "delta"):
start_time = getattr(
agent, "_sentry_chat_ttft_start_time", None
)
if start_time is not None:
ttft = time.perf_counter() - start_time
span.set_data(
SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
ttft_recorded = True

# Capture the full response from ResponseCompletedEvent
if hasattr(event, "response"):
streaming_response = event.response
Expand Down
5 changes: 5 additions & 0 deletions sentry_sdk/integrations/openai_agents/spans/ai_client.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

import sentry_sdk
from sentry_sdk.consts import OP, SPANDATA

Expand Down Expand Up @@ -36,6 +38,9 @@ def ai_client_span(
# TODO-anton: remove hardcoded stuff and replace something that also works for embedding and so on
span.set_data(SPANDATA.GEN_AI_OPERATION_NAME, "chat")

# Store start time for TTFT calculation on the agent object
agent._sentry_chat_ttft_start_time = time.perf_counter()
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated

_set_agent_data(span, agent)
_set_input_data(span, get_response_kwargs)

Expand Down
97 changes: 97 additions & 0 deletions tests/integrations/openai_agents/test_openai_agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2202,3 +2202,100 @@ async def test_streaming_span_update_captures_response_data(
assert span._data["gen_ai.usage.input_tokens"] == 10
assert span._data["gen_ai.usage.output_tokens"] == 20
assert span._data["gen_ai.response.model"] == "gpt-4-streaming"


@pytest.mark.asyncio
async def test_streaming_ttft_on_chat_span(sentry_init, test_agent):
    """
    Test that time-to-first-token (TTFT) is recorded on chat spans during streaming.

    TTFT is triggered by events with a `delta` attribute, which includes:
    - ResponseTextDeltaEvent (text output)
    - ResponseAudioDeltaEvent (audio output)
    - ResponseReasoningTextDeltaEvent (reasoning/thinking)
    - ResponseFunctionCallArgumentsDeltaEvent (function call args)
    - and other delta events...

    Events WITHOUT delta (like ResponseCompletedEvent, ResponseCreatedEvent, etc.)
    should NOT trigger TTFT.
    """
    # Local import: time is only needed for the perf_counter() call below.
    import time

    sentry_init(
        integrations=[OpenAIAgentsIntegration()],
        traces_sample_rate=1.0,
    )

    # Create a mock model that returns a stream_response generator.
    # spec=[...] on each MagicMock restricts which attributes exist, so
    # hasattr(event, "delta") is only True for the delta events — this is
    # exactly the predicate the integration uses to detect the first token.
    class MockModel:
        model = "gpt-4"

        async def stream_response(self, *args, **kwargs):
            # First event: ResponseCreatedEvent (no delta - should NOT trigger TTFT)
            created_event = MagicMock(spec=["type", "sequence_number"])
            created_event.type = "response.created"
            yield created_event

            # Simulate server-side processing delay before first token
            await asyncio.sleep(0.05)  # 50ms delay

            # Second event: ResponseTextDeltaEvent (HAS delta - triggers TTFT)
            # This simulates the first actual content token
            text_delta_event = MagicMock(spec=["delta", "type", "content_index"])
            text_delta_event.delta = "Hello"
            text_delta_event.type = "response.output_text.delta"
            yield text_delta_event
            await asyncio.sleep(0.05)  # 50ms delay

            # Third event: more text content (also has delta, but TTFT already recorded)
            text_delta_event2 = MagicMock(spec=["delta", "type", "content_index"])
            text_delta_event2.delta = " world!"
            text_delta_event2.type = "response.output_text.delta"
            yield text_delta_event2

            # Final event: ResponseCompletedEvent (has response, no delta)
            completed_event = MagicMock(spec=["response", "type", "sequence_number"])
            completed_event.response = MagicMock()
            completed_event.response.model = "gpt-4"
            completed_event.response.usage = Usage(
                requests=1,
                input_tokens=10,
                output_tokens=5,
                total_tokens=15,
            )
            completed_event.response.output = []
            yield completed_event

    mock_model = MockModel()

    with sentry_sdk.start_transaction(name="test_ttft", sampled=True) as transaction:
        # Simulate calling the wrapped stream_response logic
        from sentry_sdk.integrations.openai_agents.spans import ai_client_span

        # ai_client_span is expected to stash a perf_counter() start time on
        # the agent as `_sentry_chat_ttft_start_time`; the getattr below reads
        # it back the same way the streaming patch does.
        with ai_client_span(test_agent, {}) as span:
            span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True)

            ttft_recorded = False
            start_time = getattr(test_agent, "_sentry_chat_ttft_start_time", None)

            async for event in mock_model.stream_response():
                # This is the same logic used in the actual integration
                # (NOTE(review): duplicated here rather than exercising the
                # patched model wrapper itself — keep in sync with
                # openai_agents/patches/models.py if that logic changes).
                if (
                    not ttft_recorded
                    and hasattr(event, "delta")
                    and start_time is not None
                ):
                    ttft = time.perf_counter() - start_time
                    span.set_data(SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft)
                    ttft_recorded = True

        # Verify TTFT is recorded on the chat span (inside transaction context)
        chat_spans = [
            s for s in transaction._span_recorder.spans if s.op == "gen_ai.chat"
        ]
        assert len(chat_spans) >= 1
        chat_span = chat_spans[0]
        assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in chat_span._data
        ttft_value = chat_span._data[SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
        # TTFT should be at least 40ms (our simulated delay minus some variance) but reasonable.
        # asyncio.sleep(0.05) suspends for at least ~50ms, so the 0.04 lower bound is
        # safe; the 1.0 upper bound only guards against pathological slowness.
        # NOTE(review): wall-clock bounds like this can flake on heavily loaded CI —
        # consider monkeypatching time.perf_counter if this ever becomes unstable.
        assert 0.04 < ttft_value < 1.0, f"TTFT {ttft_value} should be around 50ms"
Loading