Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions sentry_sdk/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,12 @@ class SPANDATA:
Example: ["The weather in Paris is rainy and overcast, with temperatures around 57°F", "The weather in London is sunny and warm, with temperatures around 65°F"]
"""

GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN = "gen_ai.response.time_to_first_token"
"""
The time it took to receive the first token from the model.
Example: 0.1
"""

GEN_AI_RESPONSE_TOOL_CALLS = "gen_ai.response.tool_calls"
"""
The tool calls in the model's response.
Expand Down
28 changes: 26 additions & 2 deletions sentry_sdk/integrations/openai.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
from functools import wraps
import time

import sentry_sdk
from sentry_sdk import consts
Expand Down Expand Up @@ -249,6 +250,7 @@ def _set_output_data(
response: "Any",
kwargs: "dict[str, Any]",
integration: "OpenAIIntegration",
start_time: "Optional[float]" = None,
finish_span: bool = True,
) -> None:
if hasattr(response, "model"):
Expand All @@ -263,6 +265,8 @@ def _set_output_data(
if messages is not None and isinstance(messages, str):
messages = [messages]

ttft: "Optional[float]" = None

if hasattr(response, "choices"):
if should_send_default_pii() and integration.include_prompts:
response_text = [
Expand Down Expand Up @@ -320,6 +324,7 @@ def _set_output_data(
old_iterator = response._iterator

def new_iterator() -> "Iterator[ChatCompletionChunk]":
nonlocal ttft
count_tokens_manually = True
for x in old_iterator:
with capture_internal_exceptions():
Expand All @@ -330,6 +335,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
if hasattr(choice, "delta") and hasattr(
choice.delta, "content"
):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
content = choice.delta.content
if len(data_buf) <= choice_index:
data_buf.append([])
Expand All @@ -338,6 +345,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":

# OpenAI responses API
elif hasattr(x, "delta"):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
if len(data_buf) == 0:
data_buf.append([])
data_buf[0].append(x.delta or "")
Expand All @@ -356,6 +365,10 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
yield x

with capture_internal_exceptions():
if ttft is not None:
set_data_normalized(
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
if len(data_buf) > 0:
all_responses = ["".join(chunk) for chunk in data_buf]
if should_send_default_pii() and integration.include_prompts:
Expand All @@ -375,6 +388,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
span.__exit__(None, None, None)

async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
nonlocal ttft
count_tokens_manually = True
async for x in old_iterator:
with capture_internal_exceptions():
Expand All @@ -385,6 +399,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
if hasattr(choice, "delta") and hasattr(
choice.delta, "content"
):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
content = choice.delta.content
if len(data_buf) <= choice_index:
data_buf.append([])
Expand All @@ -393,6 +409,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":

# OpenAI responses API
elif hasattr(x, "delta"):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
if len(data_buf) == 0:
data_buf.append([])
data_buf[0].append(x.delta or "")
Expand All @@ -411,6 +429,10 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
yield x

with capture_internal_exceptions():
if ttft is not None:
set_data_normalized(
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
if len(data_buf) > 0:
all_responses = ["".join(chunk) for chunk in data_buf]
if should_send_default_pii() and integration.include_prompts:
Expand Down Expand Up @@ -465,9 +487,10 @@ def _new_chat_completion_common(f: "Any", *args: "Any", **kwargs: "Any") -> "Any

_set_input_data(span, kwargs, operation, integration)

start_time = time.perf_counter()
response = yield f, args, kwargs

_set_output_data(span, response, kwargs, integration, finish_span=True)
_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)

return response

Expand Down Expand Up @@ -645,9 +668,10 @@ def _new_responses_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "An

_set_input_data(span, kwargs, operation, integration)

start_time = time.perf_counter()
response = yield f, args, kwargs

_set_output_data(span, response, kwargs, integration, finish_span=True)
_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)

return response

Expand Down
15 changes: 14 additions & 1 deletion sentry_sdk/integrations/openai_agents/patches/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import copy
import sys
import time
from functools import wraps

from sentry_sdk.integrations import DidNotEnable
Expand Down Expand Up @@ -149,8 +149,21 @@ async def wrapped_stream_response(*args: "Any", **kwargs: "Any") -> "Any":
span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True)

streaming_response = None
ttft_recorded = False

async for event in original_stream_response(*args, **kwargs):
# Detect first content token (text delta event)
if not ttft_recorded and hasattr(event, "delta"):
start_time = getattr(
agent, "_sentry_chat_ttft_start_time", None
)
if start_time is not None:
ttft = time.perf_counter() - start_time
span.set_data(
SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
ttft_recorded = True

# Capture the full response from ResponseCompletedEvent
if hasattr(event, "response"):
streaming_response = event.response
Expand Down
5 changes: 5 additions & 0 deletions sentry_sdk/integrations/openai_agents/spans/ai_client.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

import sentry_sdk
from sentry_sdk.consts import OP, SPANDATA

Expand Down Expand Up @@ -36,6 +38,9 @@ def ai_client_span(
# TODO-anton: remove hardcoded stuff and replace something that also works for embedding and so on
span.set_data(SPANDATA.GEN_AI_OPERATION_NAME, "chat")

# Store start time for TTFT calculation on the agent object
agent._sentry_chat_ttft_start_time = time.perf_counter()
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated

_set_agent_data(span, agent)
_set_input_data(span, get_response_kwargs)

Expand Down
97 changes: 97 additions & 0 deletions tests/integrations/openai_agents/test_openai_agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2202,3 +2202,100 @@ async def test_streaming_span_update_captures_response_data(
assert span._data["gen_ai.usage.input_tokens"] == 10
assert span._data["gen_ai.usage.output_tokens"] == 20
assert span._data["gen_ai.response.model"] == "gpt-4-streaming"


@pytest.mark.asyncio
async def test_streaming_ttft_on_chat_span(sentry_init, test_agent):
    """
    Test that time-to-first-token (TTFT) is recorded on chat spans during streaming.

    TTFT is triggered by events with a `delta` attribute, which includes:
    - ResponseTextDeltaEvent (text output)
    - ResponseAudioDeltaEvent (audio output)
    - ResponseReasoningTextDeltaEvent (reasoning/thinking)
    - ResponseFunctionCallArgumentsDeltaEvent (function call args)
    - and other delta events...

    Events WITHOUT delta (like ResponseCompletedEvent, ResponseCreatedEvent, etc.)
    should NOT trigger TTFT.
    """
    # Local import: time is only needed for the perf_counter() call below.
    import time

    sentry_init(
        integrations=[OpenAIAgentsIntegration()],
        traces_sample_rate=1.0,
    )

    # Create a mock model that returns a stream_response generator.
    # spec=[...] on each MagicMock restricts which attributes exist, so
    # hasattr(event, "delta") is only True for the delta events — this is
    # exactly the predicate the integration uses to detect the first token.
    class MockModel:
        model = "gpt-4"

        async def stream_response(self, *args, **kwargs):
            # First event: ResponseCreatedEvent (no delta - should NOT trigger TTFT)
            created_event = MagicMock(spec=["type", "sequence_number"])
            created_event.type = "response.created"
            yield created_event

            # Simulate server-side processing delay before first token
            await asyncio.sleep(0.05)  # 50ms delay

            # Second event: ResponseTextDeltaEvent (HAS delta - triggers TTFT)
            # This simulates the first actual content token
            text_delta_event = MagicMock(spec=["delta", "type", "content_index"])
            text_delta_event.delta = "Hello"
            text_delta_event.type = "response.output_text.delta"
            yield text_delta_event
            await asyncio.sleep(0.05)  # 50ms delay

            # Third event: more text content (also has delta, but TTFT already recorded)
            text_delta_event2 = MagicMock(spec=["delta", "type", "content_index"])
            text_delta_event2.delta = " world!"
            text_delta_event2.type = "response.output_text.delta"
            yield text_delta_event2

            # Final event: ResponseCompletedEvent (has response, no delta)
            completed_event = MagicMock(spec=["response", "type", "sequence_number"])
            completed_event.response = MagicMock()
            completed_event.response.model = "gpt-4"
            completed_event.response.usage = Usage(
                requests=1,
                input_tokens=10,
                output_tokens=5,
                total_tokens=15,
            )
            completed_event.response.output = []
            yield completed_event

    mock_model = MockModel()

    with sentry_sdk.start_transaction(name="test_ttft", sampled=True) as transaction:
        # Simulate calling the wrapped stream_response logic
        from sentry_sdk.integrations.openai_agents.spans import ai_client_span

        # ai_client_span is expected to stash a perf_counter() start time on
        # the agent as `_sentry_chat_ttft_start_time`; the getattr below reads
        # it back the same way the streaming patch does.
        with ai_client_span(test_agent, {}) as span:
            span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True)

            ttft_recorded = False
            start_time = getattr(test_agent, "_sentry_chat_ttft_start_time", None)

            async for event in mock_model.stream_response():
                # This is the same logic used in the actual integration
                # (NOTE(review): duplicated here rather than exercising the
                # patched model wrapper itself — keep in sync with
                # openai_agents/patches/models.py if that logic changes).
                if (
                    not ttft_recorded
                    and hasattr(event, "delta")
                    and start_time is not None
                ):
                    ttft = time.perf_counter() - start_time
                    span.set_data(SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft)
                    ttft_recorded = True

        # Verify TTFT is recorded on the chat span (inside transaction context)
        chat_spans = [
            s for s in transaction._span_recorder.spans if s.op == "gen_ai.chat"
        ]
        assert len(chat_spans) >= 1
        chat_span = chat_spans[0]
        assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in chat_span._data
        ttft_value = chat_span._data[SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
        # TTFT should be at least 40ms (our simulated delay minus some variance) but reasonable.
        # asyncio.sleep(0.05) suspends for at least ~50ms, so the 0.04 lower bound is
        # safe; the 1.0 upper bound only guards against pathological slowness.
        # NOTE(review): wall-clock bounds like this can flake on heavily loaded CI —
        # consider monkeypatching time.perf_counter if this ever becomes unstable.
        assert 0.04 < ttft_value < 1.0, f"TTFT {ttft_value} should be around 50ms"
Loading