diff --git a/sentry_sdk/_types.py b/sentry_sdk/_types.py index ad3fa35849..2dd257f6ec 100644 --- a/sentry_sdk/_types.py +++ b/sentry_sdk/_types.py @@ -242,6 +242,7 @@ class SDKInfo(TypedDict): "type": Literal["check_in", "transaction"], "user": dict[str, object], "_dropped_spans": int, + "_has_gen_ai_span": bool, }, total=False, ) diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py index d0b93e3bb1..8821ac4922 100644 --- a/sentry_sdk/client.py +++ b/sentry_sdk/client.py @@ -2,11 +2,12 @@ import uuid import random import socket -from collections.abc import Mapping +from collections.abc import Mapping, Iterable from datetime import datetime, timezone from importlib import import_module from typing import TYPE_CHECKING, List, Dict, cast, overload import warnings +import json from sentry_sdk._compat import check_uwsgi_thread_support from sentry_sdk._metrics_batcher import MetricsBatcher @@ -30,6 +31,7 @@ ) from sentry_sdk.serializer import serialize from sentry_sdk.tracing import trace +from sentry_sdk.traces import SpanStatus from sentry_sdk.tracing_utils import has_span_streaming_enabled from sentry_sdk.transport import ( HttpTransportCore, @@ -38,6 +40,7 @@ ) from sentry_sdk.consts import ( SPANDATA, + SPANSTATUS, DEFAULT_MAX_VALUE_LENGTH, DEFAULT_OPTIONS, INSTRUMENTER, @@ -47,7 +50,7 @@ from sentry_sdk.integrations import _DEFAULT_INTEGRATIONS, setup_integrations from sentry_sdk.integrations.dedupe import DedupeIntegration from sentry_sdk.sessions import SessionFlusher -from sentry_sdk.envelope import Envelope +from sentry_sdk.envelope import Envelope, Item, PayloadRef from sentry_sdk.profiler.continuous_profiler import setup_continuous_profiler from sentry_sdk.profiler.transaction_profiler import ( has_profiling_enabled, @@ -56,6 +59,7 @@ ) from sentry_sdk.scrubber import EventScrubber from sentry_sdk.monitor import Monitor +from sentry_sdk.utils import datetime_from_isoformat if TYPE_CHECKING: from typing import Any @@ -66,7 +70,15 @@ from typing import 
Union from typing import TypeVar - from sentry_sdk._types import Event, Hint, SDKInfo, Log, Metric, EventDataCategory + from sentry_sdk._types import ( + Event, + Hint, + SDKInfo, + Log, + Metric, + EventDataCategory, + SerializedAttributeValue, + ) from sentry_sdk.integrations import Integration from sentry_sdk.scope import Scope from sentry_sdk.session import Session @@ -89,6 +101,196 @@ } +def _serialized_v1_attribute_to_serialized_v2_attribute( + attribute_value: "Any", +) -> "Optional[SerializedAttributeValue]": + if isinstance(attribute_value, bool): + return { + "value": attribute_value, + "type": "boolean", + } + + if isinstance(attribute_value, int): + return { + "value": attribute_value, + "type": "integer", + } + + if isinstance(attribute_value, float): + return { + "value": attribute_value, + "type": "double", + } + + if isinstance(attribute_value, str): + return { + "value": attribute_value, + "type": "string", + } + + if isinstance(attribute_value, list): + if not attribute_value: + return {"value": [], "type": "array"} + + ty = type(attribute_value[0]) + if ty in (int, str, bool, float) and all( + type(v) is ty for v in attribute_value + ): + return { + "value": attribute_value, + "type": "array", + } + + # Types returned when the serializer for V1 span attributes recurses into some container types. + if isinstance(attribute_value, (dict, list)): + return { + "value": json.dumps(attribute_value), + "type": "string", + } + + return None + + +def _serialized_v1_span_to_serialized_v2_span( + span: "dict[str, Any]", event: "Event" +) -> "dict[str, Any]": + # See SpanBatcher._to_transport_format() for analogous population of all entries except "attributes". 
+ res: "dict[str, Any]" = { + "status": SpanStatus.OK.value, + "is_segment": False, + } + + if "trace_id" in span: + res["trace_id"] = span["trace_id"] + + if "span_id" in span: + res["span_id"] = span["span_id"] + + if "description" in span: + description = span["description"] + + if description is None and "op" in span: + description = span["op"] + + res["name"] = description + + if "start_timestamp" in span: + start_timestamp = None + try: + start_timestamp = datetime_from_isoformat(span["start_timestamp"]) + except Exception: + pass + + if start_timestamp is not None: + res["start_timestamp"] = start_timestamp.timestamp() + + if "timestamp" in span: + end_timestamp = None + try: + end_timestamp = datetime_from_isoformat(span["timestamp"]) + except Exception: + pass + + if end_timestamp is not None: + res["end_timestamp"] = end_timestamp.timestamp() + + if "parent_span_id" in span: + res["parent_span_id"] = span["parent_span_id"] + + if "status" in span and span["status"] != SPANSTATUS.OK: + res["status"] = "error" + + attributes: "Dict[str, Any]" = {} + + if "op" in span: + attributes["sentry.op"] = span["op"] + if "origin" in span: + attributes["sentry.origin"] = span["origin"] + + span_data = span.get("data") + if isinstance(span_data, dict): + attributes.update(span_data) + + span_tags = span.get("tags") + if isinstance(span_tags, dict): + attributes.update(span_tags) + + # See Scope._apply_user_attributes_to_telemetry() for user attributes. + user = event.get("user") + if isinstance(user, dict): + if "id" in user: + attributes["user.id"] = user["id"] + if "username" in user: + attributes["user.name"] = user["username"] + if "email" in user: + attributes["user.email"] = user["email"] + + # See Scope.set_global_attributes() for release, environment, and SDK metadata. 
+ if "release" in event: + attributes["sentry.release"] = event["release"] + if "environment" in event: + attributes["sentry.environment"] = event["environment"] + if "transaction" in event: + attributes["sentry.segment.name"] = event["transaction"] + + trace_context = event.get("contexts", {}).get("trace", {}) + if "span_id" in trace_context: + attributes["sentry.segment.id"] = trace_context["span_id"] + + sdk_info = event.get("sdk") + if isinstance(sdk_info, dict): + if "name" in sdk_info: + attributes["sentry.sdk.name"] = sdk_info["name"] + if "version" in sdk_info: + attributes["sentry.sdk.version"] = sdk_info["version"] + + if not attributes: + return res + + res["attributes"] = {} + for key, value in attributes.items(): + converted_value = _serialized_v1_attribute_to_serialized_v2_attribute(value) + if converted_value is None: + continue + + res["attributes"][key] = converted_value + + # Remove redundant attribute, as status is stored in the status field. + if "status" in res["attributes"]: + del res["attributes"]["status"] + + return res + + +def _split_gen_ai_spans( + event_opt: "Event", +) -> "Optional[tuple[List[Dict[str, object]], List[Dict[str, object]]]]": + if "spans" not in event_opt: + return None + + spans: "Any" = event_opt["spans"] + if isinstance(spans, AnnotatedValue): + spans = spans.value + + if not isinstance(spans, Iterable): + return None + + non_gen_ai_spans = [] + gen_ai_spans = [] + for span in spans: + if not isinstance(span, dict): + non_gen_ai_spans.append(span) + continue + + span_op = span.get("op") + if isinstance(span_op, str) and span_op.startswith("gen_ai."): + gen_ai_spans.append(span) + else: + non_gen_ai_spans.append(span) + + return non_gen_ai_spans, gen_ai_spans + + def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]": if args and (isinstance(args[0], (bytes, str)) or args[0] is None): dsn: "Optional[str]" = args[0] @@ -874,6 +1076,8 @@ def capture_event( event_id = event.get("event_id") if 
event_id is None: event["event_id"] = event_id = uuid.uuid4().hex + + span_recorder_has_gen_ai_span = event.pop("_has_gen_ai_span", False) event_opt = self._prepare_event(event, hint, scope) if event_opt is None: return None @@ -909,10 +1113,43 @@ def capture_event( envelope = Envelope(headers=headers) - if is_transaction: - if isinstance(profile, Profile): - envelope.add_profile(profile.to_json(event_opt, self.options)) + if is_transaction and isinstance(profile, Profile): + envelope.add_profile(profile.to_json(event_opt, self.options)) + + if is_transaction and not span_recorder_has_gen_ai_span: envelope.add_transaction(event_opt) + elif is_transaction: + split_spans = _split_gen_ai_spans(event_opt) + if split_spans is None or not split_spans[1]: + envelope.add_transaction(event_opt) + else: + non_gen_ai_spans, gen_ai_spans = split_spans + + event_opt["spans"] = non_gen_ai_spans + envelope.add_transaction(event_opt) + + converted_gen_ai_spans = [ + _serialized_v1_span_to_serialized_v2_span(span, event_opt) + for span in gen_ai_spans + if isinstance(span, dict) + ] + + envelope.add_item( + Item( + type=SpanBatcher.TYPE, + content_type=SpanBatcher.CONTENT_TYPE, + headers={ + "item_count": len(converted_gen_ai_spans), + }, + payload=PayloadRef( + json={ + "version": 2, + "items": converted_gen_ai_spans, + }, + ), + ) + ) + elif is_checkin: envelope.add_checkin(event_opt) else: diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index d2b4cd89af..0a58292d6d 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -1218,6 +1218,7 @@ def __init__( before_send_metric: "Optional[Callable[[Metric, Hint], Optional[Metric]]]" = None, org_id: "Optional[str]" = None, strict_trace_continuation: bool = False, + stream_gen_ai_spans: bool = False, ) -> None: """Initialize the Sentry SDK with the given parameters. All parameters described here can be used in a call to `sentry_sdk.init()`. 
@@ -1633,6 +1634,9 @@ def __init__( but you can provide it explicitly for self-hosted and Relay setups. This value is used for trace propagation and for features like `strict_trace_continuation`. + :param stream_gen_ai_spans: When set, generative AI spans are sent in a new transport format to + reduce downstream data loss. + :param _experiments: """ pass diff --git a/sentry_sdk/tracing.py b/sentry_sdk/tracing.py index c0c0fa9bde..6cf2527fe3 100644 --- a/sentry_sdk/tracing.py +++ b/sentry_sdk/tracing.py @@ -1040,11 +1040,23 @@ def finish( return None - finished_spans = [ - span.to_json() - for span in self._span_recorder.spans - if span.timestamp is not None - ] + finished_spans = [] + has_gen_ai_span = False + if client.options.get("stream_gen_ai_spans", False): + for span in self._span_recorder.spans: + if span.timestamp is None: + continue + + if isinstance(span.op, str) and span.op.startswith("gen_ai."): + has_gen_ai_span = True + + finished_spans.append(span.to_json()) + else: + finished_spans = [ + span.to_json() + for span in self._span_recorder.spans + if span.timestamp is not None + ] len_diff = len(self._span_recorder.spans) - len(finished_spans) dropped_spans = len_diff + self._span_recorder.dropped_spans @@ -1076,6 +1088,9 @@ def finish( if dropped_spans > 0: event["_dropped_spans"] = dropped_spans + if has_gen_ai_span: + event["_has_gen_ai_span"] = True + if self._profile is not None and self._profile.valid(): event["profile"] = self._profile self._profile = None diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py index e86f7e1fa9..31f487aef2 100644 --- a/tests/integrations/anthropic/test_anthropic.py +++ b/tests/integrations/anthropic/test_anthropic.py @@ -81,6 +81,7 @@ async def __call__(self, *args, **kwargs): ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -91,14 +92,20 @@ async def __call__(self, *args, 
**kwargs): ], ) def test_nonstreaming_create_message( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -109,50 +116,106 @@ def test_nonstreaming_create_message( } ] - with start_transaction(name="anthropic"): - response = client.messages.create( - max_tokens=1024, messages=messages, model="model" - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert response == EXAMPLE_MESSAGE - usage = response.usage + with start_transaction(name="anthropic"): + response = client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) - assert usage.input_tokens == 10 - assert usage.output_tokens == 20 + assert response == EXAMPLE_MESSAGE + usage = response.usage - assert len(events) == 1 - (event,) = events + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == 
"chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - if send_default_pii and include_prompts: + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
+ assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "end_turn" + ] else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with start_transaction(name="anthropic"): + response = client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) + + assert response == EXAMPLE_MESSAGE + usage = response.usage + + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 + + assert len(events) == 1 + (event,) = events - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
+ else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -164,14 +227,20 @@ def test_nonstreaming_create_message( ], ) async def test_nonstreaming_create_message_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -182,49 +251,102 @@ async def test_nonstreaming_create_message_async( } ] - with start_transaction(name="anthropic"): - response = await client.messages.create( - max_tokens=1024, messages=messages, model="model" - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert response == EXAMPLE_MESSAGE - usage = response.usage + with start_transaction(name="anthropic"): + response = await client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) - assert usage.input_tokens == 10 - assert usage.output_tokens == 20 + assert response == EXAMPLE_MESSAGE + usage = response.usage - assert len(events) == 1 - (event,) = events 
+ assert usage.input_tokens == 10 + assert usage.output_tokens == 20 - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - if send_default_pii and include_prompts: + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
+ ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with start_transaction(name="anthropic"): + response = await client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) + + assert response == EXAMPLE_MESSAGE + usage = response.usage + + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + if send_default_pii and include_prompts: 
+ assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -237,10 +359,12 @@ async def test_nonstreaming_create_message_async( def test_streaming_create_message( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -285,8 +409,8 @@ def test_streaming_create_message( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -295,12 +419,14 @@ def test_streaming_create_message( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -308,44 +434,102 @@ def test_streaming_create_message( for _ in message: pass - assert len(events) == 1 - (event,) = events + 
(event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" + + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - if send_default_pii and include_prompts: + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
- + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "max_tokens" + ] else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_streaming_create_message_close( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -390,8 +574,8 @@ def test_streaming_create_message_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -400,12 +584,14 @@ def test_streaming_create_message_close( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): messages = client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -415,33 +601,81 @@ def test_streaming_create_message_close( messages.close() - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + span = next( + span 
for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + messages = client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + for _ in range(4): + next(messages) + + messages.close() + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 41), reason="Error classes moved in https://github.com/anthropics/anthropic-sdk-python/commit/4e0b15e22fe40e9aa513459564f641bf97c90954.", @@ -449,8 +683,10 @@ def test_streaming_create_message_close( def test_streaming_create_message_api_error( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -490,8 +726,8 @@ def test_streaming_create_message_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -500,50 +736,99 @@ def test_streaming_create_message_api_error( } ] - with pytest.raises(APIStatusError), mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = client.messages.create( max_tokens=1024, messages=messages, 
model="model", stream=True ) for _ in message: pass + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" + + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert len(events) == 1 - (event,) = events + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["status"] == "error" + else: + events = capture_events() - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + + assert span["status"] == "internal_error" + assert span["tags"]["status"] == "internal_error" assert event["contexts"]["trace"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -556,10 +841,12 @@ def test_streaming_create_message_api_error( def test_stream_messages( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -604,8 +891,8 @@ def test_stream_messages( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -614,58 +901,116 @@ def test_stream_messages( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - messages=messages, - model="model", - ) as stream: - for event in stream: - pass + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for event in stream: + pass + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(events) == 1 - (event,) = 
events + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - if send_default_pii and include_prompts: + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
- + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "max_tokens" + ] else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for event in stream: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_stream_messages_close( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -710,8 +1055,8 @@ def test_stream_messages_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -720,53 +1065,106 @@ def test_stream_messages_close( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - messages=messages, - model="model", - ) as stream: - for _ in range(4): - next(stream) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # New versions add TextEvent, so consume one more event. - if TextEvent is not None and isinstance(next(stream), TextEvent): - next(stream) + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for _ in range(4): + next(stream) - stream.close() + # New versions add TextEvent, so consume one more event. 
+ if TextEvent is not None and isinstance(next(stream), TextEvent): + next(stream) - assert len(events) == 1 - (event,) = events + stream.close() - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for _ in range(4): + next(stream) + + # New versions add TextEvent, so consume one more event. + if TextEvent is not None and isinstance(next(stream), TextEvent): + next(stream) + + stream.close() + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 41), reason="Error classes moved in https://github.com/anthropics/anthropic-sdk-python/commit/4e0b15e22fe40e9aa513459564f641bf97c90954.", @@ -774,8 +1172,10 @@ def test_stream_messages_close( def test_stream_messages_api_error( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -815,8 +1215,8 @@ def test_stream_messages_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -825,51 +1225,100 @@ def test_stream_messages_api_error( } ] - with pytest.raises(APIStatusError), mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - messages=messages, - model="model", - ) as stream: - for event in stream: - pass + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, 
start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for event in stream: + pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["status"] == "error" + else: + events = capture_events() + + with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for event in stream: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["status"] == "internal_error" + assert span["tags"]["status"] == "internal_error" assert event["contexts"]["trace"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -883,11 +1332,13 @@ def test_stream_messages_api_error( async def test_streaming_create_message_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") @@ -935,8 +1386,8 @@ async def test_streaming_create_message_async( traces_sample_rate=1.0, default_integrations=False, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -945,12 +1396,14 @@ async def test_streaming_create_message_async( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = await client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -958,47 +1411,104 @@ async def test_streaming_create_message_async( async for _ in message: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload 
for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + ) - if send_default_pii and include_prompts: + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
- + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "max_tokens" + ] else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] - - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + events = capture_events() + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = await client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + async for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["max_tokens"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_streaming_create_message_async_close( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") @@ -1045,8 +1555,8 @@ async def test_streaming_create_message_async_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1055,12 +1565,14 @@ async def test_streaming_create_message_async_close( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): messages = await client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -1069,33 +1581,80 @@ async def test_streaming_create_message_async_close( await messages.__anext__() await messages.close() - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == 
"transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + messages = await client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + for _ in range(4): + await messages.__anext__() + await messages.close() - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert len(events) == 1 + (event,) = events + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 41), reason="Error classes moved in https://github.com/anthropics/anthropic-sdk-python/commit/4e0b15e22fe40e9aa513459564f641bf97c90954.", @@ -1104,9 +1663,11 @@ async def test_streaming_create_message_async_close( async def test_streaming_create_message_async_api_error( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") @@ -1148,8 +1709,8 @@ async def test_streaming_create_message_async_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1158,12 +1719,14 @@ async def test_streaming_create_message_async_api_error( } ] - with pytest.raises(APIStatusError), mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = await client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -1171,37 +1734,85 @@ async def test_streaming_create_message_async_api_error( async for _ in message: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") 
+ assert event["transaction"] == "anthropic" + + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) + + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["status"] == "error" + else: + events = capture_events() - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = await client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + async for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + + assert span["status"] == "internal_error" + assert span["tags"]["status"] == "internal_error" assert event["contexts"]["trace"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -1215,11 +1826,13 @@ async def test_streaming_create_message_async_api_error( async def test_stream_message_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") @@ -1266,8 +1879,8 @@ async def test_stream_message_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1276,12 +1889,14 @@ async def test_stream_message_async( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): async with client.messages.stream( max_tokens=1024, messages=messages, @@ -1290,39 +1905,91 @@ async def test_stream_message_async( async for event in stream: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - 
assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + ) - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - if send_default_pii and include_prompts: + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
- else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + async with client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + async for event in stream: + pass + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 41), reason="Error classes moved in https://github.com/anthropics/anthropic-sdk-python/commit/4e0b15e22fe40e9aa513459564f641bf97c90954.", @@ -1331,9 +1998,11 @@ async def test_stream_message_async( async def test_stream_messages_async_api_error( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") @@ -1375,8 +2044,8 @@ async def test_stream_messages_async_api_error( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1385,12 +2054,14 @@ async def test_stream_messages_async_api_error( } ] - with pytest.raises(APIStatusError), mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): async with client.messages.stream( max_tokens=1024, messages=messages, @@ -1399,44 +2070,95 @@ async def test_stream_messages_async_api_error( async for event in stream: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for 
item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
+ assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + + assert span["status"] == "error" + else: + events = capture_events() + + with pytest.raises(APIStatusError), mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + async with client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + async for event in stream: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" 
- assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + + assert span["status"] == "internal_error" + assert span["tags"]["status"] == "internal_error" assert event["contexts"]["trace"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_stream_messages_async_close( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") @@ -1483,8 +2205,8 @@ async def test_stream_messages_async_close( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1493,12 +2215,14 @@ async def test_stream_messages_async_close( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): async with client.messages.stream( max_tokens=1024, messages=messages, @@ -1515,33 +2239,88 @@ async def test_stream_messages_async_close( await stream.close() - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = 
[item.payload for item in items if item.type == "span"] + span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "Hello, Claude"}]' - ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] + == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + async with client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for _ in range(4): + await stream.__anext__() + + # New versions add TextEvent, so consume one more event. 
+ if TextEvent is not None and isinstance( + await stream.__anext__(), TextEvent + ): + await stream.__anext__() + + await stream.close() + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + span = next(span for span in event["spans"] if span["op"] == OP.GEN_AI_CHAT) + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "Hello, Claude"}]' + ) + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi!" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "msg_01XFDUDYJgAACzvnptvVoYEL" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 27), reason="Versions <0.27.0 do not include InputJSONDelta, which was introduced in >=0.27.0 along with a new message delta type for tool calling.", @@ -1558,10 +2337,12 @@ async def test_stream_messages_async_close( def test_streaming_create_message_with_input_json_delta( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, 
server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -1636,8 +2417,8 @@ def test_streaming_create_message_with_input_json_delta( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1646,12 +2427,14 @@ def test_streaming_create_message_with_input_json_delta( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -1659,40 +2442,87 @@ def test_streaming_create_message_with_input_json_delta( for _ in message: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == 
'{"location": "San Francisco, CA"}' + ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - if send_default_pii and include_prompts: - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' - ) - assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - == '{"location": "San Francisco, CA"}' - ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert len(event["spans"]) == 1 + (span,) = event["spans"] + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert 
span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 27), reason="Versions <0.27.0 do not include InputJSONDelta, which was introduced in >=0.27.0 along with a new message delta type for tool calling.", @@ -1709,10 +2539,12 @@ def test_streaming_create_message_with_input_json_delta( def test_stream_messages_with_input_json_delta( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): client = Anthropic(api_key="z") @@ -1787,8 +2619,8 @@ def test_stream_messages_with_input_json_delta( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1797,54 +2629,101 @@ def test_stream_messages_with_input_json_delta( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - 
messages=messages, - model="model", - ) as stream: - for event in stream: - pass + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert len(events) == 1 - (event,) = events + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for event in stream: + pass - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - if send_default_pii and include_prompts: - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' - ) - assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - == '{"location": "San Francisco, CA"}' - ) + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) 
+ else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + for event in stream: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert 
span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 27), @@ -1862,11 +2741,13 @@ def test_stream_messages_with_input_json_delta( async def test_streaming_create_message_with_input_json_delta_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") response = get_model_response( @@ -1946,8 +2827,8 @@ async def test_streaming_create_message_with_input_json_delta_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1956,12 +2837,14 @@ async def test_streaming_create_message_with_input_json_delta_async( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = await client.messages.create( max_tokens=1024, messages=messages, model="model", stream=True ) @@ -1969,41 +2852,88 @@ async def test_streaming_create_message_with_input_json_delta_async( async for _ in message: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = 
[item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) - if send_default_pii and include_prompts: - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' - ) - assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - == '{"location": "San Francisco, CA"}' - ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) 
as _, start_transaction(name="anthropic"): + message = await client.messages.create( + max_tokens=1024, messages=messages, model="model", stream=True + ) + + async for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) + + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.skipif( ANTHROPIC_VERSION < (0, 27), @@ -2021,11 +2951,13 @@ async def test_streaming_create_message_with_input_json_delta_async( async def test_stream_message_with_input_json_delta_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, 
async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): client = AsyncAnthropic(api_key="z") response = get_model_response( @@ -2105,8 +3037,8 @@ async def test_stream_message_with_input_json_delta_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -2115,12 +3047,14 @@ async def test_stream_message_with_input_json_delta_async( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): async with client.messages.stream( max_tokens=1024, messages=messages, @@ -2129,70 +3063,108 @@ async def test_stream_message_with_input_json_delta_async( async for event in stream: pass - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" - - assert len(event["spans"]) == 1 - (span,) = event["spans"] + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - if send_default_pii and include_prompts: - assert ( - span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' - ) - assert ( - span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - == 
'{"location": "San Francisco, CA"}' - ) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + async with client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + ) as stream: + async for event in stream: + pass + + assert len(events) == 1 + (event,) = events + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" -def test_exception_message_create(sentry_init, capture_events): - 
sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + == '[{"role": "user", "content": "What is the weather like in San Francisco?"}]' + ) + assert ( + span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == '{"location": "San Francisco, CA"}' + ) + + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 366 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 41 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 407 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_exception_message_create( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) client = Anthropic(api_key="z") client.messages._post = mock.Mock( side_effect=AnthropicError("API rate limit reached") ) - with pytest.raises(AnthropicError): - client.messages.create( - model="some-model", - messages=[{"role": "system", "content": "I'm throwing an exception"}], - max_tokens=1024, - ) - - (event, transaction) = events - assert event["level"] == "error" - assert transaction["contexts"]["trace"]["status"] == "internal_error" - -def test_span_status_error(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], 
traces_sample_rate=1.0) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("event", "transaction") - with start_transaction(name="anthropic"): - client = Anthropic(api_key="z") - client.messages._post = mock.Mock( - side_effect=AnthropicError("API rate limit reached") - ) with pytest.raises(AnthropicError): client.messages.create( model="some-model", @@ -2200,66 +3172,209 @@ def test_span_status_error(sentry_init, capture_events): max_tokens=1024, ) - (error, transaction) = events - assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + (event,) = (item.payload for item in items if item.type == "event") + assert event["level"] == "error" + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() -@pytest.mark.asyncio -async def test_span_status_error_async(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() - - with start_transaction(name="anthropic"): - client = AsyncAnthropic(api_key="z") - client.messages._post = AsyncMock( - side_effect=AnthropicError("API rate limit reached") - ) with pytest.raises(AnthropicError): - await client.messages.create( + client.messages.create( model="some-model", messages=[{"role": "system", "content": "I'm throwing an exception"}], max_tokens=1024, ) - (error, transaction) = events - assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" - assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert 
transaction["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + (event, transaction) = events + assert event["level"] == "error" + + assert transaction["contexts"]["trace"]["status"] == "internal_error" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_status_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + if stream_gen_ai_spans: + items = capture_items("event", "span") + with start_transaction(name="anthropic"): + client = Anthropic(api_key="z") + client.messages._post = mock.Mock( + side_effect=AnthropicError("API rate limit reached") + ) + with pytest.raises(AnthropicError): + client.messages.create( + model="some-model", + messages=[ + {"role": "system", "content": "I'm throwing an exception"} + ], + max_tokens=1024, + ) + + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client = Anthropic(api_key="z") + client.messages._post = mock.Mock( + side_effect=AnthropicError("API rate limit reached") + ) + with pytest.raises(AnthropicError): + client.messages.create( + model="some-model", + messages=[ + {"role": "system", "content": "I'm throwing an exception"} + ], + max_tokens=1024, + ) + + (error, transaction) = events + assert error["level"] == "error" + assert transaction["spans"][0]["status"] == "internal_error" + assert transaction["spans"][0]["tags"]["status"] == "internal_error" + assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert 
transaction["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +@pytest.mark.asyncio +async def test_span_status_error_async( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + if stream_gen_ai_spans: + items = capture_items("event", "span") + + with start_transaction(name="anthropic"): + client = AsyncAnthropic(api_key="z") + client.messages._post = AsyncMock( + side_effect=AnthropicError("API rate limit reached") + ) + with pytest.raises(AnthropicError): + await client.messages.create( + model="some-model", + messages=[ + {"role": "system", "content": "I'm throwing an exception"} + ], + max_tokens=1024, + ) + + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + else: + events = capture_events() + with start_transaction(name="anthropic"): + client = AsyncAnthropic(api_key="z") + client.messages._post = AsyncMock( + side_effect=AnthropicError("API rate limit reached") + ) + with pytest.raises(AnthropicError): + await client.messages.create( + model="some-model", + messages=[ + {"role": "system", "content": "I'm throwing an exception"} + ], + max_tokens=1024, + ) + + (error, transaction) = events + assert error["level"] == "error" + assert transaction["spans"][0]["status"] == "internal_error" + assert transaction["spans"][0]["tags"]["status"] == "internal_error" + assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert transaction["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_exception_message_create_async(sentry_init, capture_events): - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() +async def test_exception_message_create_async( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock( side_effect=AnthropicError("API rate limit reached") ) - with pytest.raises(AnthropicError): - await client.messages.create( - model="some-model", - messages=[{"role": "system", "content": "I'm throwing an exception"}], - max_tokens=1024, - ) - (event, transaction) = events - assert event["level"] == "error" + if stream_gen_ai_spans: + items = capture_items("event", "transaction") + + with pytest.raises(AnthropicError): + await client.messages.create( + model="some-model", + messages=[{"role": "system", "content": "I'm throwing an exception"}], + max_tokens=1024, + ) + + (event,) = (item.payload for item in items if item.type == "event") + assert event["level"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with pytest.raises(AnthropicError): + await client.messages.create( + model="some-model", + messages=[{"role": "system", "content": "I'm throwing an exception"}], + max_tokens=1024, + ) + + (event, transaction) = events + assert event["level"] == "error" assert transaction["contexts"]["trace"]["status"] == "internal_error" -def test_span_origin(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_origin( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[AnthropicIntegration()], 
traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -2271,24 +3386,45 @@ def test_span_origin(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + else: + events = capture_events() - (event,) = events + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + (event,) = events + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.anthropic" + assert event["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert event["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_span_origin_async(sentry_init, capture_events): +async def test_span_origin_async( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, 
+): sentry_init( integrations=[AnthropicIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncAnthropic(api_key="z") client.messages._post = AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -2300,15 +3436,35 @@ async def test_span_origin_async(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - await client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (event,) = events + with start_transaction(name="anthropic"): + await client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert event["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert spans[0]["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + else: + events = capture_events() + + with start_transaction(name="anthropic"): + await client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) + + (event,) = events + + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.anthropic" + assert event["spans"][0]["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert event["spans"][0]["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" @pytest.mark.skipif( @@ -2376,6 +3532,7 @@ def test_set_output_data_with_input_json_delta(sentry_init): # Test messages with mixed roles including "ai" 
that should be mapped to "assistant" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "test_message,expected_role", [ @@ -2392,15 +3549,20 @@ def test_set_output_data_with_input_json_delta(sentry_init): ], ) def test_anthropic_message_role_mapping( - sentry_init, capture_events, test_message, expected_role + sentry_init, + capture_events, + capture_items, + test_message, + expected_role, + stream_gen_ai_spans, ): """Test that Anthropic integration properly maps message roles like 'ai' to 'assistant'""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = Anthropic(api_key="z") @@ -2420,34 +3582,63 @@ def mock_messages_create(*args, **kwargs): test_messages = [test_message] - with start_transaction(name="anthropic tx"): - client.messages.create( - model="claude-3-opus", max_tokens=10, messages=test_messages + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic tx"): + client.messages.create( + model="claude-3-opus", max_tokens=10, messages=test_messages + ) + + span = next(item.payload for item in items if item.type == "span") + + # Verify that the span was created correctly + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + + # Parse the stored messages + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] ) + else: + events = capture_events() - (event,) = events - span = event["spans"][0] + with start_transaction(name="anthropic tx"): + client.messages.create( + model="claude-3-opus", max_tokens=10, messages=test_messages + ) - # Verify that the span was created correctly - 
assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + (event,) = events + span = event["spans"][0] + + # Verify that the span was created correctly + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - # Parse the stored messages - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + # Parse the stored messages + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert stored_messages[0]["role"] == expected_role -def test_anthropic_message_truncation(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_anthropic_message_truncation( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that large messages are truncated properly in Anthropic integration.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -2463,43 +3654,82 @@ def test_anthropic_message_truncation(sentry_init, capture_events): {"role": "user", "content": "small message 5"}, ] - with start_transaction(): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(): + client.messages.create(max_tokens=1024, messages=messages, model="model") - chat_spans = [ - span for span in tx.get("spans", []) if span.get("op") == 
OP.GEN_AI_CHAT - ] - assert len(chat_spans) > 0 + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_CHAT + ] + + assert len(chat_spans) > 0 + + chat_span = chat_spans[0] + assert chat_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert chat_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["attributes"] - chat_span = chat_spans[0] - assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + messages_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) - messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert isinstance(messages_data, str) + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) - parsed_messages = json.loads(messages_data) - assert isinstance(parsed_messages, list) - assert len(parsed_messages) == 1 - assert "small message 5" in str(parsed_messages[0]) + tx = next(item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with start_transaction(): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" + + chat_spans = [ + span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT + ] + + assert len(chat_spans) > 0 + + chat_span = chat_spans[0] + assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + + messages_data = 
chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) + + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_anthropic_message_truncation_async(sentry_init, capture_events): +async def test_anthropic_message_truncation_async( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that large messages are truncated properly in Anthropic integration.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncAnthropic(api_key="z") client.messages._post = mock.AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -2515,34 +3745,73 @@ async def test_anthropic_message_truncation_async(sentry_init, capture_events): {"role": "user", "content": "small message 5"}, ] - with start_transaction(): - await client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(): + await client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) - chat_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT - ] - assert len(chat_spans) > 0 + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_CHAT + ] + assert len(chat_spans) > 0 + + chat_span = chat_spans[0] + + assert chat_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + 
assert chat_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["attributes"] + + messages_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - chat_span = chat_spans[0] - assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + assert isinstance(messages_data, str) - messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert isinstance(messages_data, str) + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) - parsed_messages = json.loads(messages_data) - assert isinstance(parsed_messages, list) - assert len(parsed_messages) == 1 - assert "small message 5" in str(parsed_messages[0]) + tx = next(item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with start_transaction(): + await client.messages.create( + max_tokens=1024, messages=messages, model="model" + ) + + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" + + chat_spans = [ + span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT + ] + assert len(chat_spans) > 0 + + chat_span = chat_spans[0] + + assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + + messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) + + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -2553,15 +3822,21 @@ async def test_anthropic_message_truncation_async(sentry_init, capture_events): ], ) def test_nonstreaming_create_message_with_system_prompt( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """Test that system prompts are properly captured in GEN_AI_REQUEST_MESSAGES.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -2572,62 +3847,127 @@ def test_nonstreaming_create_message_with_system_prompt( } ] - with start_transaction(name="anthropic"): - response = client.messages.create( - max_tokens=1024, - messages=messages, - model="model", - system="You are a helpful assistant.", - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert response == EXAMPLE_MESSAGE - usage = response.usage + with start_transaction(name="anthropic"): + response = client.messages.create( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) - assert usage.input_tokens == 10 - assert usage.output_tokens == 20 + assert response == EXAMPLE_MESSAGE + usage = response.usage - assert len(events) == 1 - (event,) = events + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for 
item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."} - ] + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - assert len(stored_messages) == 1 - assert stored_messages[0]["role"] == "user" - assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
+ ) + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "end_turn" + ] else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + with start_transaction(name="anthropic"): + response = client.messages.create( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) + assert response == EXAMPLE_MESSAGE + usage = response.usage + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert 
SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = json.loads( + span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -2639,15 +3979,21 @@ def test_nonstreaming_create_message_with_system_prompt( ], ) async def test_nonstreaming_create_message_with_system_prompt_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """Test that system prompts are properly captured in GEN_AI_REQUEST_MESSAGES (async).""" sentry_init( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = AsyncAnthropic(api_key="z") client.messages._post = 
AsyncMock(return_value=EXAMPLE_MESSAGE) @@ -2658,62 +4004,127 @@ async def test_nonstreaming_create_message_with_system_prompt_async( } ] - with start_transaction(name="anthropic"): - response = await client.messages.create( - max_tokens=1024, - messages=messages, - model="model", - system="You are a helpful assistant.", - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert response == EXAMPLE_MESSAGE - usage = response.usage + with start_transaction(name="anthropic"): + response = await client.messages.create( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) - assert usage.input_tokens == 10 - assert usage.output_tokens == 20 + assert response == EXAMPLE_MESSAGE + usage = response.usage - assert len(events) == 1 - (event,) = events + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads( - 
span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."} - ] + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - assert len(stored_messages) == 1 - assert stored_messages[0]["role"] == "user" - assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." 
+ ) + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "end_turn" + ] else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with start_transaction(name="anthropic"): + response = await client.messages.create( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) + + assert response == EXAMPLE_MESSAGE + usage = response.usage - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 + assert len(events) == 1 + (event,) = events + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert 
SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = json.loads( + span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude." + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["end_turn"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -2726,10 +4137,12 @@ async def test_nonstreaming_create_message_with_system_prompt_async( def test_streaming_create_message_with_system_prompt( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """Test that system prompts are properly captured in streaming mode.""" client = Anthropic(api_key="z") @@ -2775,8 +4188,8 @@ def test_streaming_create_message_with_system_prompt( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -2785,12 +4198,14 @@ def 
test_streaming_create_message_with_system_prompt( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = client.messages.create( max_tokens=1024, messages=messages, @@ -2802,48 +4217,111 @@ def test_streaming_create_message_with_system_prompt( for _ in message: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = 
json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."} - ] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + ) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - assert len(stored_messages) == 1 - assert stored_messages[0]["role"] == "user" - assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = client.messages.create( + max_tokens=1024, + messages=messages, + model="model", + stream=True, + system="You are a helpful assistant.", + ) + + for _ in message: + pass + + assert len(events) == 1 + (event,) = events + + assert event["type"] 
== "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = json.loads( + span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
- assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -2856,10 +4334,12 @@ def test_streaming_create_message_with_system_prompt( def test_stream_messages_with_system_prompt( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """Test that system prompts are properly captured in streaming mode.""" client = Anthropic(api_key="z") @@ -2905,8 +4385,8 @@ def test_stream_messages_with_system_prompt( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -2915,63 +4395,119 @@ def test_stream_messages_with_system_prompt( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - messages=messages, - model="model", - system="You are a helpful assistant.", - ) as stream: - for event in stream: - pass + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert len(events) == 1 - (event,) = 
events + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) as stream: + for event in stream: + pass - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert len(event["spans"]) == 1 - (span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."} - ] + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + 
span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + ) + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + else: + events = capture_events() - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - assert len(stored_messages) == 1 - assert stored_messages[0]["role"] == "user" - assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) as stream: + for event in stream: + pass - else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert len(events) == 1 + (event,) = events - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + assert len(event["spans"]) == 1 + (span,) = event["spans"] + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = json.loads( + span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -2985,11 +4521,13 @@ def test_stream_messages_with_system_prompt( async def test_stream_message_with_system_prompt_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): """Test that system prompts are properly captured in streaming mode (async).""" client = AsyncAnthropic(api_key="z") @@ -3037,8 +4575,8 @@ async def test_stream_message_with_system_prompt_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -3047,12 +4585,14 @@ async def test_stream_message_with_system_prompt_async( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): async with client.messages.stream( max_tokens=1024, messages=messages, @@ -3062,48 +4602,107 @@ async def test_stream_message_with_system_prompt_async( async for event in stream: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in 
items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."} - ] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ ) + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + else: + events = capture_events() - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - assert len(stored_messages) == 1 - assert stored_messages[0]["role"] == "user" - assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + async with client.messages.stream( + max_tokens=1024, + messages=messages, + model="model", + system="You are a helpful assistant.", + ) as stream: + async for event in stream: + pass - else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + + assert len(event["spans"]) == 1 + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = 
json.loads( + span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -3117,11 +4716,13 @@ async def test_stream_message_with_system_prompt_async( async def test_streaming_create_message_with_system_prompt_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): """Test that system prompts are properly captured in streaming mode (async).""" client = AsyncAnthropic(api_key="z") @@ -3169,8 +4770,8 @@ async def test_streaming_create_message_with_system_prompt_async( integrations=[AnthropicIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, 
send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -3179,12 +4780,14 @@ async def test_streaming_create_message_with_system_prompt_async( } ] - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): message = await client.messages.create( max_tokens=1024, messages=messages, @@ -3196,56 +4799,124 @@ async def test_streaming_create_message_with_system_prompt_async( async for _ in message: pass - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "anthropic" - assert event["type"] == "transaction" - assert event["transaction"] == "anthropic" + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat model" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat model" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": 
"text", "content": "You are a helpful assistant."} + ] - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads( - span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."} - ] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - assert len(stored_messages) == 1 - assert stored_messages[0]["role"] == "user" - assert stored_messages[0]["content"] == "Hello, Claude" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert ( + span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" 
+ ) + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + message = await client.messages.create( + max_tokens=1024, + messages=messages, + model="model", + stream=True, + system="You are a helpful assistant.", + ) + + async for _ in message: + pass + + assert len(events) == 1 + (event,) = events - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert event["type"] == "transaction" + assert event["transaction"] == "anthropic" + assert len(event["spans"]) == 1 + (span,) = event["spans"] -def test_system_prompt_with_complex_structure(sentry_init, capture_events): + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat model" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "model" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = json.loads( + 
span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."} + ] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert len(stored_messages) == 1 + assert stored_messages[0]["role"] == "user" + assert stored_messages[0]["content"] == "Hello, Claude" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!" + + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_system_prompt_with_complex_structure( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that complex system prompt structures (list of text blocks) are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3262,33 +4933,72 @@ def test_system_prompt_with_complex_structure(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - response = client.messages.create( - max_tokens=1024, messages=messages, model="model", system=system_prompt + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + response = client.messages.create( + max_tokens=1024, messages=messages, 
model="model", system=system_prompt + ) + + assert response == EXAMPLE_MESSAGE + + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (span,) = spans + + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["attributes"] + system_instructions = json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + + # System content should be a list of text blocks + assert isinstance(system_instructions, list) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] ) + else: + events = capture_events() - assert response == EXAMPLE_MESSAGE - assert len(events) == 1 - (event,) = events + with start_transaction(name="anthropic"): + response = client.messages.create( + max_tokens=1024, messages=messages, model="model", system=system_prompt + ) - assert len(event["spans"]) == 1 - (span,) = event["spans"] + assert response == EXAMPLE_MESSAGE - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert len(events) == 1 + (event,) = events - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] - system_instructions = json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + assert len(event["spans"]) == 1 + (span,) = event["spans"] - # System content should be a list of text blocks - assert isinstance(system_instructions, list) - assert system_instructions == [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "anthropic" + assert 
span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS in span["data"] + system_instructions = json.loads( + span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + # System content should be a list of text blocks + assert isinstance(system_instructions, list) + assert system_instructions == [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" @@ -3490,14 +5200,21 @@ def test_transform_message_content_list_anthropic(): # Integration tests for binary data in messages -def test_message_with_base64_image(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_base64_image( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with base64 images are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3518,15 +5235,31 @@ def test_message_with_base64_image(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - 
(span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == "user" @@ -3541,14 +5274,21 @@ def test_message_with_base64_image(sentry_init, capture_events): } -def test_message_with_url_image(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_url_image( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with URL-referenced images are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3568,14 +5308,30 @@ def test_message_with_url_image(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if 
item.type == "span"] + (span,) = spans + + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] + + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "uri", @@ -3585,14 +5341,21 @@ def test_message_with_url_image(sentry_init, capture_events): } -def test_message_with_file_image(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_file_image( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with file_id-referenced images are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3613,14 +5376,30 @@ def test_message_with_file_image(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + stored_messages = json.loads( + 
span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] + + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "file", @@ -3630,14 +5409,21 @@ def test_message_with_file_image(sentry_init, capture_events): } -def test_message_with_base64_pdf(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_base64_pdf( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with base64-encoded PDF documents are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3658,14 +5444,30 @@ def test_message_with_base64_pdf(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + 
client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] + + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "blob", @@ -3675,14 +5477,21 @@ def test_message_with_base64_pdf(sentry_init, capture_events): } -def test_message_with_url_pdf(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_url_pdf( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with URL-referenced PDF documents are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3702,14 +5511,30 @@ def test_message_with_url_pdf(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans + + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] + + stored_messages = 
json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert content[1] == { "type": "uri", @@ -3719,14 +5544,21 @@ def test_message_with_url_pdf(sentry_init, capture_events): } -def test_message_with_file_document(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_file_document( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with file_id-referenced documents are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3747,14 +5579,30 @@ def test_message_with_file_document(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans + + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = 
stored_messages[0]["content"] assert content[1] == { "type": "file", @@ -3764,14 +5612,21 @@ def test_message_with_file_document(sentry_init, capture_events): } -def test_message_with_mixed_content(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_mixed_content( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with mixed content (text, images, documents) are properly captured.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3808,14 +5663,30 @@ def test_message_with_mixed_content(sentry_init, capture_events): } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] + + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert len(content) == 5 @@ -3847,14 +5718,21 @@ def test_message_with_mixed_content(sentry_init, capture_events): } 
-def test_message_with_multiple_images_different_formats(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_message_with_multiple_images_different_formats( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that messages with multiple images of different source types are handled.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3890,14 +5768,30 @@ def test_message_with_multiple_images_different_formats(sentry_init, capture_eve } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans + + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content = stored_messages[0]["content"] assert len(content) == 4 @@ -3922,14 +5816,21 @@ def test_message_with_multiple_images_different_formats(sentry_init, capture_eve assert content[3] == {"type": "text", "text": "Compare these three images."} -def 
test_binary_content_not_stored_when_pii_disabled(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_binary_content_not_stored_when_pii_disabled( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that binary content is not stored when send_default_pii is False.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3950,25 +5851,46 @@ def test_binary_content_not_stored_when_pii_disabled(sentry_init, capture_events } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans + + # Messages should not be stored + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] - # Messages should not be stored - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + # Messages should not be stored + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] -def test_binary_content_not_stored_when_prompts_disabled(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_binary_content_not_stored_when_prompts_disabled( + sentry_init, + capture_events, + 
capture_items, + stream_gen_ai_spans, +): """Test that binary content is not stored when include_prompts is False.""" sentry_init( integrations=[AnthropicIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + client = Anthropic(api_key="z") client.messages._post = mock.Mock(return_value=EXAMPLE_MESSAGE) @@ -3989,21 +5911,45 @@ def test_binary_content_not_stored_when_prompts_disabled(sentry_init, capture_ev } ] - with start_transaction(name="anthropic"): - client.messages.create(max_tokens=1024, messages=messages, model="model") + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + spans = [item.payload for item in items if item.type == "span"] + (span,) = spans - assert len(events) == 1 - (event,) = events - (span,) = event["spans"] + # Messages should not be stored + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create(max_tokens=1024, messages=messages, model="model") + + assert len(events) == 1 + (event,) = events + (span,) = event["spans"] - # Messages should not be stored - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + # Messages should not be stored + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] -def test_cache_tokens_nonstreaming(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_cache_tokens_nonstreaming( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test cache read/write tokens are tracked for non-streaming responses.""" - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], 
+ traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4022,23 +5968,49 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events): ) ) - with start_transaction(name="anthropic"): - client.messages.create( - max_tokens=1024, - messages=[{"role": "user", "content": "Hello"}], - model="claude-3-5-sonnet-20241022", - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + ) - (span,) = events[0]["spans"] - # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + (span,) = (item.payload for item in items if item.type == "span") + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + ) + (span,) = events[0]["spans"] + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert 
span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 50 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 -def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_events): + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_input_tokens_include_cache_write_nonstreaming( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that gen_ai.usage.input_tokens includes cache_write tokens (non-streaming). @@ -4050,8 +6022,12 @@ def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_even Usage(input_tokens=19, output_tokens=14, cache_creation_input_tokens=2846, cache_read_input_tokens=0) """ - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4070,23 +6046,53 @@ def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_even ) ) - with start_transaction(name="anthropic"): - client.messages.create( - max_tokens=1024, - messages=[{"role": "user", "content": "What is 3+3?"}], - model="claude-sonnet-4-20250514", + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 3+3?"}], + model="claude-sonnet-4-20250514", + ) + + (span,) = (item.payload for item in items if item.type == "span") + + # input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + 
assert ( + span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 + ) # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0 + assert ( + span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846 ) + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 3+3?"}], + model="claude-sonnet-4-20250514", + ) - (span,) = events[0]["spans"] + (span,) = events[0]["spans"] - # input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846 + # input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846 -def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_input_tokens_include_cache_read_nonstreaming( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that gen_ai.usage.input_tokens includes cache_read tokens (non-streaming). 
@@ -4098,8 +6104,12 @@ def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_event Usage(input_tokens=19, output_tokens=14, cache_creation_input_tokens=0, cache_read_input_tokens=2846) """ - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4118,27 +6128,52 @@ def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_event ) ) - with start_transaction(name="anthropic"): - client.messages.create( - max_tokens=1024, - messages=[{"role": "user", "content": "What is 5+5?"}], - model="claude-sonnet-4-20250514", - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (span,) = events[0]["spans"] + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", + ) + + (span,) = [item.payload for item in items if item.type == "span"] + + # input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert ( + span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 + ) # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + else: + events = capture_events() + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", + ) + + (span,) = events[0]["spans"] - # input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert 
span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + # input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_input_tokens_include_cache_read_streaming( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """ Test that gen_ai.usage.input_tokens includes cache_read tokens (streaming). @@ -4175,15 +6210,45 @@ def test_input_tokens_include_cache_read_streaming( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + for _ in client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", + stream=True, + ): + pass + + (span,) = (item.payload for item in items if item.type == "span") + + # input_tokens should be total: 19 + 2846 = test_stream_messages_input_tokens_include_cache_read_streaming + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert ( + 
span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 + ) # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): for _ in client.messages.create( max_tokens=1024, messages=[{"role": "user", "content": "What is 5+5?"}], @@ -4192,20 +6257,23 @@ def test_input_tokens_include_cache_read_streaming( ): pass - (span,) = events[0]["spans"] + (span,) = events[0]["spans"] - # input_tokens should be total: 19 + 2846 = test_stream_messages_input_tokens_include_cache_read_streaming - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + # input_tokens should be total: 19 + 2846 = test_stream_messages_input_tokens_include_cache_read_streaming + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_stream_messages_input_tokens_include_cache_read_streaming( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """ Test that gen_ai.usage.input_tokens includes cache_read tokens (streaming). 
@@ -4241,41 +6309,79 @@ def test_stream_messages_input_tokens_include_cache_read_streaming( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - messages=[{"role": "user", "content": "What is 5+5?"}], - model="claude-sonnet-4-20250514", - ) as stream: - for event in stream: - pass + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", + ) as stream: + for event in stream: + pass + + (span,) = (item.payload for item in items if item.type == "span") + + # input_tokens should be total: 19 + 2846 = 2865 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert ( + span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 + ) # 2865 + 14 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 5+5?"}], + model="claude-sonnet-4-20250514", + ) as stream: + for event in stream: + pass - (span,) = events[0]["spans"] + (span,) = events[0]["spans"] - # input_tokens should be total: 19 + 2846 = 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] 
== 2865 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 + # input_tokens should be total: 19 + 2846 = 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879 # 2865 + 14 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0 -def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_input_tokens_unchanged_without_caching( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that input_tokens is unchanged when there are no cached tokens. Real Anthropic response (from E2E test, simple call without caching): Usage(input_tokens=20, output_tokens=12) """ - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + client = Anthropic(api_key="z") client.messages._post = mock.Mock( @@ -4292,24 +6398,44 @@ def test_input_tokens_unchanged_without_caching(sentry_init, capture_events): ) ) - with start_transaction(name="anthropic"): - client.messages.create( - max_tokens=1024, - messages=[{"role": "user", "content": "What is 2+2?"}], - model="claude-sonnet-4-20250514", - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 2+2?"}], + model="claude-sonnet-4-20250514", + ) + + (span,) = (item.payload for item in items if item.type == "span") + + assert 
span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12 + else: + events = capture_events() - (span,) = events[0]["spans"] + with start_transaction(name="anthropic"): + client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "What is 2+2?"}], + model="claude-sonnet-4-20250514", + ) + + (span,) = events[0]["spans"] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32 # 20 + 12 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_cache_tokens_streaming( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """Test cache tokens are tracked for streaming responses.""" client = Anthropic(api_key="z") @@ -4342,15 +6468,43 @@ def test_cache_tokens_streaming( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): + for _ in client.messages.create( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + stream=True, + ): + pass + + (span,) = (item.payload for item in items if item.type == "span") + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert 
span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"): for _ in client.messages.create( max_tokens=1024, messages=[{"role": "user", "content": "Hello"}], @@ -4359,17 +6513,23 @@ def test_cache_tokens_streaming( ): pass - (span,) = events[0]["spans"] - # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + (span,) = events[0]["spans"] + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_stream_messages_cache_tokens( - sentry_init, capture_events, get_model_response, server_side_event_chunks + sentry_init, + capture_events, + capture_items, + get_model_response, + server_side_event_chunks, + stream_gen_ai_spans, ): """Test cache tokens are tracked for streaming responses.""" client = Anthropic(api_key="z") @@ -4402,27 
+6562,53 @@ def test_stream_messages_cache_tokens( ) ) - sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0) - events = capture_events() + sentry_init( + integrations=[AnthropicIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - with mock.patch.object( - client._client, - "send", - return_value=response, - ) as _: - with start_transaction(name="anthropic"): - with client.messages.stream( - max_tokens=1024, - messages=[{"role": "user", "content": "Hello"}], - model="claude-3-5-sonnet-20241022", - ) as stream: - for event in stream: - pass + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + ) as stream: + for event in stream: + pass + + (span,) = (item.payload for item in items if item.type == "span") + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + else: + events = capture_events() + + with mock.patch.object( + client._client, + "send", + return_value=response, + ) as _, start_transaction(name="anthropic"), client.messages.stream( + max_tokens=1024, + messages=[{"role": "user", "content": "Hello"}], + model="claude-3-5-sonnet-20241022", + ) as stream: + for event in stream: + pass - (span,) = events[0]["spans"] - # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 - assert 
span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 + (span,) = events[0]["spans"] + # input_tokens normalized: 100 + 80 (cache_read) + 20 (cache_write) = 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 210 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 diff --git a/tests/integrations/google_genai/test_google_genai.py b/tests/integrations/google_genai/test_google_genai.py index 6e91ba6634..79318eaea5 100644 --- a/tests/integrations/google_genai/test_google_genai.py +++ b/tests/integrations/google_genai/test_google_genai.py @@ -114,6 +114,7 @@ def create_test_config( return genai_types.GenerateContentConfig(**config_dict) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -124,62 +125,120 @@ def create_test_config( ], ) def test_nonstreaming_generate_content( - sentry_init, capture_events, send_default_pii, include_prompts, mock_genai_client + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + mock_genai_client, + stream_gen_ai_spans, ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Mock the HTTP response at the _api_client.request() level mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) - with mock.patch.object( - 
mock_genai_client._api_client, - "request", - return_value=mock_http_response, - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + mock_genai_client._api_client, + "request", + return_value=mock_http_response, + ), start_transaction(name="google_genai"): config = create_test_config(temperature=0.7, max_output_tokens=100) mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Tell me a joke", config=config ) - assert len(events) == 1 - (event,) = events - assert event["type"] == "transaction" - assert event["transaction"] == "google_genai" - - assert len(event["spans"]) == 1 - chat_span = event["spans"][0] - - # Check chat span - assert chat_span["op"] == OP.GEN_AI_CHAT - assert chat_span["description"] == "chat gemini-1.5-flash" - assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" - assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" - - if send_default_pii and include_prompts: - # Response text is stored as a JSON array - response_text = chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - # Parse the JSON array - response_texts = json.loads(response_text) - assert response_texts == ["Hello! 
How can I help you today?"] + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "google_genai" + + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + chat_span = next(item.payload for item in items if item.type == "span") + + # Check chat span + assert chat_span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert chat_span["name"] == "chat gemini-1.5-flash" + assert chat_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert chat_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + ) + + if send_default_pii and include_prompts: + # Response text is stored as a JSON array + response_text = chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + # Parse the JSON array + response_texts = json.loads(response_text) + assert response_texts == ["Hello! How can I help you today?"] + else: + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_span["attributes"] + + # Check token usage + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + # Output tokens now include reasoning tokens: candidates_token_count (20) + thoughts_token_count (3) = 23 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 23 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + ) else: - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_span["data"] - - # Check token usage - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - # Output tokens now include reasoning tokens: candidates_token_count (20) + thoughts_token_count (3) = 23 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 23 - assert 
chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + events = capture_events() + with mock.patch.object( + mock_genai_client._api_client, + "request", + return_value=mock_http_response, + ), start_transaction(name="google_genai"): + config = create_test_config(temperature=0.7, max_output_tokens=100) + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Tell me a joke", config=config + ) + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "google_genai" + + assert len(event["spans"]) == 1 + chat_span = event["spans"][0] + + # Check chat span + assert chat_span["op"] == OP.GEN_AI_CHAT + assert chat_span["description"] == "chat gemini-1.5-flash" + assert chat_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + assert chat_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + + if send_default_pii and include_prompts: + # Response text is stored as a JSON array + response_text = chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + # Parse the JSON array + response_texts = json.loads(response_text) + assert response_texts == ["Hello! 
How can I help you today?"] + else: + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_span["data"] + + # Check token usage + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + # Output tokens now include reasoning tokens: candidates_token_count (20) + thoughts_token_count (3) = 23 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 23 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize("generate_content_config", (False, True)) @pytest.mark.parametrize( "system_instructions,expected_texts", @@ -211,24 +270,28 @@ def test_nonstreaming_generate_content( def test_generate_content_with_system_instruction( sentry_init, capture_events, + capture_items, mock_genai_client, generate_content_config, system_instructions, expected_texts, + stream_gen_ai_spans, ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): config = { "system_instruction": system_instructions, "temperature": 0.5, @@ -243,29 +306,66 @@ def test_generate_content_with_system_instruction( config=config, ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - if 
expected_texts is None: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_span["data"] - return + if expected_texts is None: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_span["attributes"] + return - # (PII is enabled and include_prompts is True in this test) - system_instructions = json.loads( - invoke_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - ) + # (PII is enabled and include_prompts is True in this test) + system_instructions = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + config = { + "system_instruction": system_instructions, + "temperature": 0.5, + } + + if generate_content_config: + config = create_test_config(**config) + + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", + contents="What is 2+2?", + config=config, + ) + + (event,) = events + invoke_span = event["spans"][0] + + if expected_texts is None: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_span["data"] + return + + # (PII is enabled and include_prompts is True in this test) + system_instructions = json.loads( + invoke_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) assert system_instructions == [ {"type": "text", "content": text} for text in expected_texts ] -def test_generate_content_with_tools(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_generate_content_with_tools( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Create a mock tool function def get_weather(location: str) -> str: @@ -310,28 +410,55 @@ def get_weather(location: str) 
-> str: mock_http_response = create_mock_http_response(tool_response_json) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): config = create_test_config(tools=[get_weather, mock_tool]) mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="What's the weather?", config=config ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") - # Check that tools are recorded (data is serialized as a string) - tools_data_str = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - # Parse the JSON string to verify content - tools_data = json.loads(tools_data_str) - assert len(tools_data) == 2 + # Check that tools are recorded (data is serialized as a string) + tools_data_str = invoke_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS + ] + # Parse the JSON string to verify content + tools_data = json.loads(tools_data_str) + assert len(tools_data) == 2 + + # The order of tools may not be guaranteed, so sort by name and description for comparison + sorted_tools = sorted( + tools_data, key=lambda t: (t.get("name", ""), t.get("description", "")) + ) + else: + events = capture_events() - # The order of tools may not be guaranteed, so sort by name and description for comparison - sorted_tools = sorted( - tools_data, key=lambda t: (t.get("name", ""), t.get("description", "")) - ) + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + config = create_test_config(tools=[get_weather, mock_tool]) + mock_genai_client.models.generate_content( + model="gemini-1.5-flash",
contents="What's the weather?", config=config + ) + + (event,) = events + invoke_span = event["spans"][0] + + # Check that tools are recorded (data is serialized as a string) + tools_data_str = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + # Parse the JSON string to verify content + tools_data = json.loads(tools_data_str) + assert len(tools_data) == 2 + + # The order of tools may not be guaranteed, so sort by name and description for comparison + sorted_tools = sorted( + tools_data, key=lambda t: (t.get("name", ""), t.get("description", "")) + ) # The function tool assert sorted_tools[0]["name"] == "get_weather" @@ -342,13 +469,19 @@ def get_weather(location: str) -> str: assert sorted_tools[1]["description"] == "Get weather information (tool object)" -def test_tool_execution(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_tool_execution( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Create a mock tool function def get_weather(location: str) -> str: @@ -360,47 +493,95 @@ def get_weather(location: str) -> str: wrapped_weather = wrapped_tool(get_weather) - # Execute the wrapped tool - with start_transaction(name="test_tool"): - result = wrapped_weather("San Francisco") + if stream_gen_ai_spans: + items = capture_items("span") - assert result == "The weather in San Francisco is sunny" + # Execute the wrapped tool + with start_transaction(name="test_tool"): + result = wrapped_weather("San Francisco") - (event,) = events - assert len(event["spans"]) == 1 - tool_span = event["spans"][0] + assert result == "The weather in San Francisco is sunny" - assert tool_span["op"] == OP.GEN_AI_EXECUTE_TOOL - assert tool_span["description"] == "execute_tool get_weather" - assert 
tool_span["data"][SPANDATA.GEN_AI_TOOL_NAME] == "get_weather" - assert ( - tool_span["data"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] - == "Get the weather for a location" - ) + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + tool_span = next(item.payload for item in items if item.type == "span") + + assert tool_span["attributes"]["sentry.op"] == OP.GEN_AI_EXECUTE_TOOL + assert tool_span["name"] == "execute_tool get_weather" + assert tool_span["attributes"][SPANDATA.GEN_AI_TOOL_NAME] == "get_weather" + assert ( + tool_span["attributes"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + == "Get the weather for a location" + ) + else: + events = capture_events() + # Execute the wrapped tool + with start_transaction(name="test_tool"): + result = wrapped_weather("San Francisco") -def test_error_handling(sentry_init, capture_events, mock_genai_client): + assert result == "The weather in San Francisco is sunny" + + (event,) = events + assert len(event["spans"]) == 1 + tool_span = event["spans"][0] + + assert tool_span["op"] == OP.GEN_AI_EXECUTE_TOOL + assert tool_span["description"] == "execute_tool get_weather" + assert tool_span["data"][SPANDATA.GEN_AI_TOOL_NAME] == "get_weather" + assert ( + tool_span["data"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + == "Get the weather for a location" + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_error_handling( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - - # Mock an error at the HTTP level - with mock.patch.object( - mock_genai_client._api_client, "request", side_effect=Exception("API Error") - ): - with start_transaction(name="google_genai"): - with pytest.raises(Exception, match="API Error"): - mock_genai_client.models.generate_content( - model="gemini-1.5-flash", - 
contents="This will fail", - config=create_test_config(), - ) + if stream_gen_ai_spans: + items = capture_items("event", "transaction") + + # Mock an error at the HTTP level + with mock.patch.object( + mock_genai_client._api_client, "request", side_effect=Exception("API Error") + ), start_transaction(name="google_genai"), pytest.raises( + Exception, match="API Error" + ): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", + contents="This will fail", + config=create_test_config(), + ) - # Should have both transaction and error events - assert len(events) == 2 - error_event, transaction_event = events + (error_event,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() + + # Mock an error at the HTTP level + with mock.patch.object( + mock_genai_client._api_client, "request", side_effect=Exception("API Error") + ), start_transaction(name="google_genai"), pytest.raises( + Exception, match="API Error" + ): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", + contents="This will fail", + config=create_test_config(), + ) + + # Should have both transaction and error events + assert len(events) == 2 + error_event, transaction_event = events assert error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "Exception" @@ -408,14 +589,21 @@ def test_error_handling(sentry_init, capture_events, mock_genai_client): assert error_event["exception"]["values"][0]["mechanism"]["type"] == "google_genai" -def test_streaming_generate_content(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_streaming_generate_content( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test streaming with generate_content_stream, verifying chunk accumulation.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, 
send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Create streaming chunks - simulating a multi-chunk response # Chunk 1: First part of text with partial usage metadata @@ -479,10 +667,12 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie stream_chunks = [chunk1_json, chunk2_json, chunk3_json] mock_stream = create_mock_streaming_responses(stream_chunks) - with mock.patch.object( - mock_genai_client._api_client, "request_streamed", return_value=mock_stream - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request_streamed", return_value=mock_stream + ), start_transaction(name="google_genai"): config = create_test_config() stream = mock_genai_client.models.generate_content_stream( model="gemini-1.5-flash", contents="Stream me a response", config=config @@ -491,74 +681,165 @@ def test_streaming_generate_content(sentry_init, capture_events, mock_genai_clie # Consume the stream (this is what users do with the integration wrapper) collected_chunks = list(stream) - # Verify we got all chunks - assert len(collected_chunks) == 3 - assert collected_chunks[0].candidates[0].content.parts[0].text == "Hello! " - assert collected_chunks[1].candidates[0].content.parts[0].text == "How can I " - assert collected_chunks[2].candidates[0].content.parts[0].text == "help you today?" - - (event,) = events - - assert len(event["spans"]) == 1 - chat_span = event["spans"][0] - - # Check that streaming flag is set on both spans - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - # Verify accumulated response text (all chunks combined) - expected_full_text = "Hello! How can I help you today?" 
- # Response text is stored as a JSON string - chat_response_text = json.loads(chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]) - assert chat_response_text == [expected_full_text] - - # Verify finish reasons (only the final chunk has a finish reason) - # When there's a single finish reason, it's stored as a plain string (not JSON) - assert SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS in chat_span["data"] - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP" - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 - assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + # Verify we got all chunks + assert len(collected_chunks) == 3 + assert collected_chunks[0].candidates[0].content.parts[0].text == "Hello! " + assert collected_chunks[1].candidates[0].content.parts[0].text == "How can I " + assert ( + collected_chunks[2].candidates[0].content.parts[0].text == "help you today?" + ) + + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + chat_span = next(item.payload for item in items if item.type == "span") + + # Check that streaming flag is set on both spans + assert chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + # Verify accumulated response text (all chunks combined) + expected_full_text = "Hello! How can I help you today?" 
+ # Response text is stored as a JSON string + chat_response_text = json.loads( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + ) + + assert chat_response_text == [expected_full_text] + + # Verify finish reasons (only the final chunk has a finish reason) + # When there's a single finish reason, it's stored as a plain string (not JSON) + assert SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS in chat_span["attributes"] + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP" + ) + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25 + assert chat_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + ) + + # Verify model name + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + ) + else: + events = capture_events() - # Verify model name - assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + with mock.patch.object( + mock_genai_client._api_client, "request_streamed", return_value=mock_stream + ), start_transaction(name="google_genai"): + config = create_test_config() + stream = mock_genai_client.models.generate_content_stream( + model="gemini-1.5-flash", contents="Stream me a response", config=config + ) + # Consume the stream (this is what users do with the integration wrapper) + collected_chunks = list(stream) -def test_span_origin(sentry_init, capture_events, mock_genai_client): + # Verify we got all chunks + assert len(collected_chunks) == 3 + assert collected_chunks[0].candidates[0].content.parts[0].text == "Hello! " + assert collected_chunks[1].candidates[0].content.parts[0].text == "How can I " + assert ( + collected_chunks[2].candidates[0].content.parts[0].text == "help you today?" 
+ ) + + (event,) = events + + assert len(event["spans"]) == 1 + chat_span = event["spans"][0] + + # Check that streaming flag is set on both spans + assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + # Verify accumulated response text (all chunks combined) + expected_full_text = "Hello! How can I help you today?" + # Response text is stored as a JSON string + chat_response_text = json.loads( + chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + ) + assert chat_response_text == [expected_full_text] + + # Verify finish reasons (only the final chunk has a finish reason) + # When there's a single finish reason, it's stored as a plain string (not JSON) + assert SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS in chat_span["data"] + assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == "STOP" + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 10 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 25 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 5 + assert chat_span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING] == 3 + + # Verify model name + assert chat_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gemini-1.5-flash" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_origin( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span", "transaction") + + with mock.patch.object( + mock_genai_client._api_client, 
"request", return_value=mock_http_response + ), start_transaction(name="google_genai"): config = create_test_config() mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Test origin", config=config ) - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" - assert event["contexts"]["trace"]["origin"] == "manual" - for span in event["spans"]: - assert span["origin"] == "auto.ai.google_genai" + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.google_genai" + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + config = create_test_config() + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Test origin", config=config + ) + + (event,) = events + + assert event["contexts"]["trace"]["origin"] == "manual" + for span in event["spans"]: + assert span["origin"] == "auto.ai.google_genai" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_response_without_usage_metadata( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test handling of responses without usage metadata""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Response without usage metadata response_json = { @@ -575,32 +856,58 @@ def test_response_without_usage_metadata( mock_http_response = create_mock_http_response(response_json) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = 
capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): config = create_test_config() mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Test", config=config ) - (event,) = events - chat_span = event["spans"][0] + chat_span = next(item.payload for item in items if item.type == "span") + + # Usage data should not be present + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in chat_span["attributes"] + assert SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS not in chat_span["attributes"] + assert SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS not in chat_span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + config = create_test_config() + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Test", config=config + ) - # Usage data should not be present - assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in chat_span["data"] - assert SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS not in chat_span["data"] - assert SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS not in chat_span["data"] + (event,) = events + chat_span = event["spans"][0] + # Usage data should not be present + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in chat_span["data"] + assert SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS not in chat_span["data"] + assert SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS not in chat_span["data"] -def test_multiple_candidates(sentry_init, capture_events, mock_genai_client): + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_multiple_candidates( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test handling of multiple response candidates""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, 
send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Response with multiple candidates multi_candidate_json = { @@ -629,50 +936,91 @@ def test_multiple_candidates(sentry_init, capture_events, mock_genai_client): mock_http_response = create_mock_http_response(multi_candidate_json) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): config = create_test_config() mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Generate multiple", config=config ) - (event,) = events - chat_span = event["spans"][0] + chat_span = next(item.payload for item in items if item.type == "span") + + # Should capture all responses + # Response text is stored as a JSON string when there are multiple responses + response_text = chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] - # Should capture all responses - # Response text is stored as a JSON string when there are multiple responses - response_text = chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - if isinstance(response_text, str) and response_text.startswith("["): - # It's a JSON array - response_list = json.loads(response_text) - assert response_list == ["Response 1", "Response 2"] + if isinstance(response_text, str) and response_text.startswith("["): + # It's a JSON array + response_list = json.loads(response_text) + assert response_list == ["Response 1", "Response 2"] + else: + # It's concatenated + assert response_text == "Response 1\nResponse 2" + + # Finish reasons are serialized as JSON + finish_reasons = json.loads( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] + ) else: - # It's concatenated - assert response_text == "Response 
1\nResponse 2" + events = capture_events() - # Finish reasons are serialized as JSON - finish_reasons = json.loads( - chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] - ) + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + config = create_test_config() + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Generate multiple", config=config + ) + + (event,) = events + chat_span = event["spans"][0] + + # Should capture all responses + # Response text is stored as a JSON string when there are multiple responses + response_text = chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + if isinstance(response_text, str) and response_text.startswith("["): + # It's a JSON array + response_list = json.loads(response_text) + assert response_list == ["Response 1", "Response 2"] + else: + # It's concatenated + assert response_text == "Response 1\nResponse 2" + + # Finish reasons are serialized as JSON + finish_reasons = json.loads( + chat_span["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] + ) assert finish_reasons == ["STOP", "MAX_TOKENS"] -def test_all_configuration_parameters(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_all_configuration_parameters( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test that all configuration parameters are properly recorded""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with 
mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): config = create_test_config( temperature=0.8, top_p=0.95, @@ -686,57 +1034,121 @@ def test_all_configuration_parameters(sentry_init, capture_events, mock_genai_cl model="gemini-1.5-flash", contents="Test all params", config=config ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + # Check all parameters are recorded + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.8 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.95 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_K] == 40 + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 2048 + assert ( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + ) + assert ( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + ) + assert invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_SEED] == 12345 + else: + events = capture_events() - # Check all parameters are recorded - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.8 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.95 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TOP_K] == 40 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 2048 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_SEED] == 12345 + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + config = create_test_config( + temperature=0.8, + top_p=0.95, + top_k=40, + max_output_tokens=2048, + presence_penalty=0.1, + frequency_penalty=0.2, + seed=12345, + 
) + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Test all params", config=config + ) + (event,) = events + invoke_span = event["spans"][0] -def test_empty_response(sentry_init, capture_events, mock_genai_client): + # Check all parameters are recorded + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.8 + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.95 + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_TOP_K] == 40 + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 2048 + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert invoke_span["data"][SPANDATA.GEN_AI_REQUEST_SEED] == 12345 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_empty_response( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test handling of minimal response with no content""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Minimal response with empty candidates array minimal_response_json = {"candidates": []} mock_http_response = create_mock_http_response(minimal_response_json) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): response = mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Test", config=create_test_config() ) - # Response will have an empty candidates list - assert response is not None - assert len(response.candidates) == 0 + # Response will have an empty candidates 
list + assert response is not None + assert len(response.candidates) == 0 - (event,) = events - # Should still create spans even with empty candidates - assert len(event["spans"]) == 1 + # Should still create spans even with empty candidates + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + else: + events = capture_events() + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + response = mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Test", config=create_test_config() + ) + # Response will have an empty candidates list + assert response is not None + assert len(response.candidates) == 0 + + (event,) = events + # Should still create spans even with empty candidates + assert len(event["spans"]) == 1 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_response_with_different_id_fields( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test handling of different response ID field names""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Response with response_id and model_version response_json = { @@ -755,28 +1167,48 @@ def test_response_with_different_id_fields( mock_http_response = create_mock_http_response(response_json) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="Test", 
config=create_test_config() ) - (event,) = events - chat_span = event["spans"][0] + chat_span = next(item.payload for item in items if item.type == "span") - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "resp-456" - assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gemini-1.5-flash-001" + assert chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_ID] == "resp-456" + assert ( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] + == "gemini-1.5-flash-001" + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents="Test", config=create_test_config() + ) + + (event,) = events + chat_span = event["spans"][0] + assert chat_span["data"][SPANDATA.GEN_AI_RESPONSE_ID] == "resp-456" + assert ( + chat_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gemini-1.5-flash-001" + ) -def test_tool_with_async_function(sentry_init, capture_events): + +def test_tool_with_async_function(sentry_init): """Test that async tool functions are properly wrapped""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, ) - capture_events() # Create an async tool function async def async_tool(param: str) -> str: @@ -792,41 +1224,72 @@ async def async_tool(param: str) -> str: assert hasattr(wrapped_async_tool, "__wrapped__") # Should preserve original -def test_contents_as_none(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_contents_as_none( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test handling when contents parameter is None""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = 
capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=None, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + # Should handle None contents gracefully + messages = invoke_span["attributes"].get(SPANDATA.GEN_AI_REQUEST_MESSAGES, []) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=None, config=create_test_config() + ) + + (event,) = events + invoke_span = event["spans"][0] + + # Should handle None contents gracefully + messages = invoke_span["data"].get(SPANDATA.GEN_AI_REQUEST_MESSAGES, []) - # Should handle None contents gracefully - messages = invoke_span["data"].get(SPANDATA.GEN_AI_REQUEST_MESSAGES, []) # Should only have system message if any, not user message assert all(msg["role"] != "user" or msg["content"] is not None for msg in messages) -def test_tool_calls_extraction(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_tool_calls_extraction( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test extraction of tool/function calls from response""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + 
stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Response with function calls function_call_response_json = { @@ -865,24 +1328,49 @@ def test_tool_calls_extraction(sentry_init, capture_events, mock_genai_client): mock_http_response = create_mock_http_response(function_call_response_json) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", + contents="What's the weather and time?", + config=create_test_config(), + ) + + chat_span = next( + item.payload for item in items if item.type == "span" + ) # The chat span + + # Check that tool calls are extracted and stored + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_span["attributes"] + + # Parse the JSON string to verify content + tool_calls = json.loads( + chat_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents="What's the weather and time?", config=create_test_config(), ) - (event,) = events - chat_span = event["spans"][0] # The chat span + (event,) = events + chat_span = event["spans"][0] # The chat span - # Check that tool calls are extracted and stored - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_span["data"] + # Check that tool calls are extracted and stored + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_span["data"] - # Parse the JSON string to verify content - tool_calls = 
json.loads(chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]) + # Parse the JSON string to verify content + tool_calls = json.loads(chat_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS]) assert len(tool_calls) == 2 @@ -902,16 +1390,21 @@ def test_tool_calls_extraction(sentry_init, capture_events, mock_genai_client): assert json.loads(tool_calls[1]["arguments"]) == {"timezone": "PST"} +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_google_genai_message_truncation( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test that large messages are truncated properly in Google GenAI integration.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() large_content = ( "This is a very long message that will exceed our size limits. 
" * 1000 @@ -920,21 +1413,39 @@ def test_google_genai_message_truncation( mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", + contents=[large_content, small_content], + config=create_test_config(), + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] + + messages_data = invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=[large_content, small_content], config=create_test_config(), ) - (event,) = events - invoke_span = event["spans"][0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] + (event,) = events + invoke_span = event["spans"][0] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] - messages_data = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + messages_data = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) @@ -970,6 +1481,7 @@ def test_google_genai_message_truncation( } +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -980,24 +1492,83 @@ def test_google_genai_message_truncation( ], ) def test_embed_content( - sentry_init, capture_events, 
send_default_pii, include_prompts, mock_genai_client + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + mock_genai_client, + stream_gen_ai_spans, ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Mock the HTTP response at the _api_client.request() level mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, - "request", - return_value=mock_http_response, - ): - with start_transaction(name="google_genai_embeddings"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + mock_genai_client._api_client, + "request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings"): + mock_genai_client.models.embed_content( + model="text-embedding-004", + contents=[ + "What is your name?", + "What is your favorite color?", + ], + ) + + (event,) = (item.payload for item in items if item.type == "transaction") + + assert event["transaction"] == "google_genai_embeddings" + + # Should have 1 span for embeddings + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (embed_span,) = spans + + # Check embeddings span + assert embed_span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert embed_span["name"] == "embeddings text-embedding-004" + assert embed_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert embed_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert ( + embed_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-004" + ) + + # Check input texts if PII is allowed + if send_default_pii and include_prompts: + input_texts = json.loads( + embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) + assert 
input_texts == [ + "What is your name?", + "What is your favorite color?", + ] + else: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["attributes"] + + # Check usage data (sum of token counts from statistics: 10 + 15 = 25) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, + "request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings"): mock_genai_client.models.embed_content( model="text-embedding-004", contents=[ @@ -1006,47 +1577,56 @@ def test_embed_content( ], ) - assert len(events) == 1 - (event,) = events + assert len(events) == 1 + (event,) = events - assert event["type"] == "transaction" - assert event["transaction"] == "google_genai_embeddings" + assert event["type"] == "transaction" + assert event["transaction"] == "google_genai_embeddings" - # Should have 1 span for embeddings - assert len(event["spans"]) == 1 - (embed_span,) = event["spans"] + # Should have 1 span for embeddings + assert len(event["spans"]) == 1 + (embed_span,) = event["spans"] - # Check embeddings span - assert embed_span["op"] == OP.GEN_AI_EMBEDDINGS - assert embed_span["description"] == "embeddings text-embedding-004" - assert embed_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert embed_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" - assert embed_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + # Check embeddings span + assert embed_span["op"] == OP.GEN_AI_EMBEDDINGS + assert embed_span["description"] == "embeddings text-embedding-004" + assert embed_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert embed_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert 
embed_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" - # Check input texts if PII is allowed - if send_default_pii and include_prompts: - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) - assert input_texts == [ - "What is your name?", - "What is your favorite color?", - ] - else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["data"] + # Check input texts if PII is allowed + if send_default_pii and include_prompts: + input_texts = json.loads( + embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) + assert input_texts == [ + "What is your name?", + "What is your favorite color?", + ] + else: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["data"] - # Check usage data (sum of token counts from statistics: 10 + 15 = 25) - # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + # Check usage data (sum of token counts from statistics: 10 + 15 = 25) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: + assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 -def test_embed_content_string_input(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_embed_content_string_input( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test embed_content with a single string instead of list.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Mock response with single embedding single_embed_response = { @@ -1065,51 +1645,103 @@ def test_embed_content_string_input(sentry_init, capture_events, 
mock_genai_clie } mock_http_response = create_mock_http_response(single_embed_response) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai_embeddings"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai_embeddings"): + mock_genai_client.models.embed_content( + model="text-embedding-004", + contents="Single text input", + ) + + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans + + # Check that single string is handled correctly + input_texts = json.loads( + embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) + + assert input_texts == ["Single text input"] + # Should use token_count from statistics (5), not billable_character_count (10) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai_embeddings"): mock_genai_client.models.embed_content( model="text-embedding-004", contents="Single text input", ) - (event,) = events - (embed_span,) = event["spans"] + (event,) = events + (embed_span,) = event["spans"] - # Check that single string is handled correctly - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) - assert input_texts == ["Single text input"] - # Should use token_count from statistics (5), not billable_character_count (10) - # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert 
embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + # Check that single string is handled correctly + input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) + assert input_texts == ["Single text input"] + # Should use token_count from statistics (5), not billable_character_count (10) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: + assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 -def test_embed_content_error_handling(sentry_init, capture_events, mock_genai_client): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_embed_content_error_handling( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test error handling in embed_content.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - - # Mock an error at the HTTP level - with mock.patch.object( - mock_genai_client._api_client, - "request", - side_effect=Exception("Embedding API Error"), - ): - with start_transaction(name="google_genai_embeddings"): - with pytest.raises(Exception, match="Embedding API Error"): - mock_genai_client.models.embed_content( - model="text-embedding-004", - contents=["This will fail"], - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "event") + + # Mock an error at the HTTP level + with mock.patch.object( + mock_genai_client._api_client, + "request", + side_effect=Exception("Embedding API Error"), + ), start_transaction(name="google_genai_embeddings"), pytest.raises( + Exception, match="Embedding API Error" + ): + mock_genai_client.models.embed_content( + model="text-embedding-004", + contents=["This will fail"], + ) - # Should have both transaction and error events - assert len(events) == 2 - error_event, _ = events + (error_event,) = (item.payload for 
item in items if item.type == "event") + else: + events = capture_events() + + # Mock an error at the HTTP level + with mock.patch.object( + mock_genai_client._api_client, + "request", + side_effect=Exception("Embedding API Error"), + ), start_transaction(name="google_genai_embeddings"), pytest.raises( + Exception, match="Embedding API Error" + ): + mock_genai_client.models.embed_content( + model="text-embedding-004", + contents=["This will fail"], + ) + + # Should have both transaction and error events + assert len(events) == 2 + error_event, _ = events assert error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "Exception" @@ -1117,15 +1749,20 @@ def test_embed_content_error_handling(sentry_init, capture_events, mock_genai_cl assert error_event["exception"]["values"][0]["mechanism"]["type"] == "google_genai" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_embed_content_without_statistics( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test embed_content response without statistics (older package versions).""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Response without statistics (typical for older google-genai versions) # Embeddings exist but don't have the statistics field @@ -1141,49 +1778,92 @@ def test_embed_content_without_statistics( } mock_http_response = create_mock_http_response(old_version_response) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai_embeddings"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai_embeddings"): 
mock_genai_client.models.embed_content( model="text-embedding-004", contents=["Test without statistics", "Another test"], ) - (event,) = events - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans + + # No usage tokens since there are no statistics in older versions + # This is expected and the integration should handle it gracefully + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai_embeddings"): + mock_genai_client.models.embed_content( + model="text-embedding-004", + contents=["Test without statistics", "Another test"], + ) - # No usage tokens since there are no statistics in older versions - # This is expected and the integration should handle it gracefully - assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["data"] + (event,) = events + (embed_span,) = event["spans"] + # No usage tokens since there are no statistics in older versions + # This is expected and the integration should handle it gracefully + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["data"] -def test_embed_content_span_origin(sentry_init, capture_events, mock_genai_client): + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_embed_content_span_origin( + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, +): """Test that embed_content spans have correct origin.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + with mock.patch.object( + mock_genai_client._api_client, "request", 
return_value=mock_http_response + ), start_transaction(name="google_genai_embeddings"): + mock_genai_client.models.embed_content( + model="text-embedding-004", + contents=["Test origin"], + ) + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai_embeddings"): + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.google_genai" + else: + events = capture_events() + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai_embeddings"): mock_genai_client.models.embed_content( model="text-embedding-004", contents=["Test origin"], ) - (event,) = events + (event,) = events - assert event["contexts"]["trace"]["origin"] == "manual" - for span in event["spans"]: - assert span["origin"] == "auto.ai.google_genai" + assert event["contexts"]["trace"]["origin"] == "manual" + for span in event["spans"]: + assert span["origin"] == "auto.ai.google_genai" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -1195,25 +1875,33 @@ def test_embed_content_span_origin(sentry_init, capture_events, mock_genai_clien ], ) async def test_async_embed_content( - sentry_init, capture_events, send_default_pii, include_prompts, mock_genai_client + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + mock_genai_client, + stream_gen_ai_spans, ): """Test async embed_content method.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) 
- events = capture_events() # Mock the async HTTP response mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, - "async_request", - return_value=mock_http_response, - ): - with start_transaction(name="google_genai_embeddings_async"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): await mock_genai_client.aio.models.embed_content( model="text-embedding-004", contents=[ @@ -1222,50 +1910,109 @@ async def test_async_embed_content( ], ) - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + + assert event["transaction"] == "google_genai_embeddings_async" + + # Should have 1 span for embeddings + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + (embed_span,) = spans + + # Check embeddings span + assert embed_span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert embed_span["name"] == "embeddings text-embedding-004" + assert embed_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert embed_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert ( + embed_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-004" + ) + + # Check input texts if PII is allowed + if send_default_pii and include_prompts: + input_texts = json.loads( + embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) + assert input_texts == [ + "What is your name?", + "What is your favorite color?", + ] + else: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["attributes"] + + # Check usage data (sum of token counts from statistics: 10 + 15 = 25) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if 
SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + else: + events = capture_events() - assert event["type"] == "transaction" - assert event["transaction"] == "google_genai_embeddings_async" + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): + await mock_genai_client.aio.models.embed_content( + model="text-embedding-004", + contents=[ + "What is your name?", + "What is your favorite color?", + ], + ) - # Should have 1 span for embeddings - assert len(event["spans"]) == 1 - (embed_span,) = event["spans"] + assert len(events) == 1 + (event,) = events - # Check embeddings span - assert embed_span["op"] == OP.GEN_AI_EMBEDDINGS - assert embed_span["description"] == "embeddings text-embedding-004" - assert embed_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert embed_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" - assert embed_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + assert event["type"] == "transaction" - # Check input texts if PII is allowed - if send_default_pii and include_prompts: - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) - assert input_texts == [ - "What is your name?", - "What is your favorite color?", - ] - else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["data"] + assert event["transaction"] == "google_genai_embeddings_async" - # Check usage data (sum of token counts from statistics: 10 + 15 = 25) - # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + # Should have 1 span for embeddings + assert len(event["spans"]) == 1 + (embed_span,) = event["spans"] + + # Check embeddings span + assert 
embed_span["op"] == OP.GEN_AI_EMBEDDINGS + assert embed_span["description"] == "embeddings text-embedding-004" + assert embed_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert embed_span["data"][SPANDATA.GEN_AI_SYSTEM] == "gcp.gemini" + assert embed_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-004" + + # Check input texts if PII is allowed + if send_default_pii and include_prompts: + input_texts = json.loads( + embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) + assert input_texts == [ + "What is your name?", + "What is your favorite color?", + ] + else: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embed_span["data"] + # Check usage data (sum of token counts from statistics: 10 + 15 = 25) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: + assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 25 + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_async_embed_content_string_input( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test async embed_content with a single string instead of list.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Mock response with single embedding single_embed_response = { @@ -1284,54 +2031,108 @@ async def test_async_embed_content_string_input( } mock_http_response = create_mock_http_response(single_embed_response) - with mock.patch.object( - mock_genai_client._api_client, "async_request", return_value=mock_http_response - ): - with start_transaction(name="google_genai_embeddings_async"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + 
mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): await mock_genai_client.aio.models.embed_content( model="text-embedding-004", contents="Single text input", ) - (event,) = events - (embed_span,) = event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans + + # Check that single string is handled correctly + input_texts = json.loads( + embed_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + ) + assert input_texts == ["Single text input"] + # Should use token_count from statistics (5), not billable_character_count (10) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["attributes"]: + assert embed_span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + else: + events = capture_events() - # Check that single string is handled correctly - input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) - assert input_texts == ["Single text input"] - # Should use token_count from statistics (5), not billable_character_count (10) - # Note: Only available in newer versions with ContentEmbeddingStatistics - if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: - assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): + await mock_genai_client.aio.models.embed_content( + model="text-embedding-004", + contents="Single text input", + ) + (event,) = events + (embed_span,) = event["spans"] + + # Check that single string is handled correctly + input_texts = json.loads(embed_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) + assert input_texts == ["Single text input"] + # Should use token_count from statistics (5), not billable_character_count 
(10) + # Note: Only available in newer versions with ContentEmbeddingStatistics + if SPANDATA.GEN_AI_USAGE_INPUT_TOKENS in embed_span["data"]: + assert embed_span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_async_embed_content_error_handling( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test error handling in async embed_content.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - - # Mock an error at the HTTP level - with mock.patch.object( - mock_genai_client._api_client, - "async_request", - side_effect=Exception("Async Embedding API Error"), - ): - with start_transaction(name="google_genai_embeddings_async"): - with pytest.raises(Exception, match="Async Embedding API Error"): - await mock_genai_client.aio.models.embed_content( - model="text-embedding-004", - contents=["This will fail"], - ) - # Should have both transaction and error events - assert len(events) == 2 - error_event, _ = events + if stream_gen_ai_spans: + items = capture_items("transaction", "event") + + # Mock an error at the HTTP level + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + side_effect=Exception("Async Embedding API Error"), + ), start_transaction(name="google_genai_embeddings_async"), pytest.raises( + Exception, match="Async Embedding API Error" + ): + await mock_genai_client.aio.models.embed_content( + model="text-embedding-004", + contents=["This will fail"], + ) + + (error_event,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() + + # Mock an error at the HTTP level + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + side_effect=Exception("Async Embedding API Error"), + 
), start_transaction(name="google_genai_embeddings_async"), pytest.raises( + Exception, match="Async Embedding API Error" + ): + await mock_genai_client.aio.models.embed_content( + model="text-embedding-004", + contents=["This will fail"], + ) + + # Should have both transaction and error events + assert len(events) == 2 + error_event, _ = events assert error_event["level"] == "error" assert error_event["exception"]["values"][0]["type"] == "Exception" @@ -1339,16 +2140,21 @@ async def test_async_embed_content_error_handling( assert error_event["exception"]["values"][0]["mechanism"]["type"] == "google_genai" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_async_embed_content_without_statistics( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test async embed_content response without statistics (older package versions).""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Response without statistics (typical for older google-genai versions) # Embeddings exist but don't have the statistics field @@ -1364,63 +2170,119 @@ async def test_async_embed_content_without_statistics( } mock_http_response = create_mock_http_response(old_version_response) - with mock.patch.object( - mock_genai_client._api_client, "async_request", return_value=mock_http_response - ): - with start_transaction(name="google_genai_embeddings_async"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): await mock_genai_client.aio.models.embed_content( model="text-embedding-004", contents=["Test without statistics", "Another test"], ) - (event,) = events - (embed_span,) = 
event["spans"] + spans = [item.payload for item in items if item.type == "span"] + (embed_span,) = spans - # No usage tokens since there are no statistics in older versions - # This is expected and the integration should handle it gracefully - assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["data"] + # No usage tokens since there are no statistics in older versions + # This is expected and the integration should handle it gracefully + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): + await mock_genai_client.aio.models.embed_content( + model="text-embedding-004", + contents=["Test without statistics", "Another test"], + ) + + (event,) = events + (embed_span,) = event["spans"] + + # No usage tokens since there are no statistics in older versions + # This is expected and the integration should handle it gracefully + assert SPANDATA.GEN_AI_USAGE_INPUT_TOKENS not in embed_span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_async_embed_content_span_origin( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test that async embed_content spans have correct origin.""" sentry_init( integrations=[GoogleGenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_EMBED_RESPONSE_JSON) - with mock.patch.object( - mock_genai_client._api_client, "async_request", return_value=mock_http_response - ): - with start_transaction(name="google_genai_embeddings_async"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + 
mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): + await mock_genai_client.aio.models.embed_content( + model="text-embedding-004", + contents=["Test origin"], + ) + + (event,) = [item.payload for item in items if item.type == "transaction"] + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.google_genai" + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, + "async_request", + return_value=mock_http_response, + ), start_transaction(name="google_genai_embeddings_async"): await mock_genai_client.aio.models.embed_content( model="text-embedding-004", contents=["Test origin"], ) - (event,) = events + (event,) = events - assert event["contexts"]["trace"]["origin"] == "manual" - for span in event["spans"]: - assert span["origin"] == "auto.ai.google_genai" + assert event["contexts"]["trace"]["origin"] == "manual" + for span in event["spans"]: + assert span["origin"] == "auto.ai.google_genai" # Integration tests for generate_content with different input message formats +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_content_object( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with Content object input.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1429,18 +2291,36 @@ def test_generate_content_with_content_object( role="user", parts=[genai_types.Part(text="Hello from Content object")] ) - 
with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=content, config=create_test_config() + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=content, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [ @@ -1448,34 +2328,57 @@ def test_generate_content_with_content_object( ] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_dict_format( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with dict format input (ContentDict).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) # Dict format 
content contents = {"role": "user", "parts": [{"text": "Hello from dict format"}]} - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [ @@ -1483,16 +2386,21 @@ def test_generate_content_with_dict_format( ] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_file_data( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with file_data (external file reference).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() 
mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1508,18 +2416,36 @@ def test_generate_content_with_file_data( ], ) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=content, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=content, config=create_test_config() + ) + + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert len(messages[0]["content"]) == 2 @@ -1533,16 +2459,21 @@ def test_generate_content_with_file_data( assert messages[0]["content"][1]["uri"] == "gs://bucket/image.jpg" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_inline_data( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with inline_data (binary data).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], 
traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1557,18 +2488,36 @@ def test_generate_content_with_inline_data( ], ) - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=content, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=content, config=create_test_config() + ) + + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert len(messages[0]["content"]) == 2 @@ -1579,16 +2528,21 @@ def test_generate_content_with_inline_data( assert messages[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_function_response( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test 
generate_content with function_response (tool result).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1614,18 +2568,36 @@ def test_generate_content_with_function_response( ), ] - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 # First message is user message assert messages[0]["role"] == "tool" @@ -1634,16 +2606,21 @@ def test_generate_content_with_function_response( assert messages[0]["content"]["output"] == "Sunny, 72F" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_mixed_string_and_content( - sentry_init, 
capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with mixed string and Content objects in list.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1660,59 +2637,105 @@ def test_generate_content_with_mixed_string_and_content( ), ] - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 # User message assert messages[0]["role"] == "user" assert messages[0]["content"] == [{"text": "Tell me a joke", "type": "text"}] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, 
False]) def test_generate_content_with_part_object_directly( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with Part object directly (not wrapped in Content).""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) # Part object directly part = genai_types.Part(text="Direct Part object") - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=part, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=part, config=create_test_config() + ) + + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [{"text": "Direct Part object", "type": "text"}] 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_list_of_dicts( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """ Test generate_content with list of dict format inputs. @@ -1725,8 +2748,8 @@ def test_generate_content_with_list_of_dicts( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1737,33 +2760,56 @@ def test_generate_content_with_list_of_dicts( {"role": "user", "parts": [{"text": "Second user message"}]}, ] - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) 
assert len(messages) == 1 assert messages[0]["role"] == "user" assert messages[0]["content"] == [{"text": "Second user message", "type": "text"}] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_with_dict_inline_data( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): """Test generate_content with dict format containing inline_data.""" sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1776,18 +2822,36 @@ def test_generate_content_with_dict_inline_data( ], } - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = 
json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" assert len(messages[0]["content"]) == 2 @@ -1800,15 +2864,20 @@ def test_generate_content_with_dict_inline_data( assert messages[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_without_parts_property_inline_data( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1817,18 +2886,35 @@ def test_generate_content_without_parts_property_inline_data( {"inline_data": {"data": b"fake_binary_data", "mime_type": "image/gif"}}, ] - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): + mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - 
invoke_span = event["spans"][0] + (event,) = events + invoke_span = event["spans"][0] - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 @@ -1844,15 +2930,20 @@ def test_generate_content_without_parts_property_inline_data( assert messages[0]["content"][1]["inline_data"]["mime_type"] == "image/gif" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_generate_content_without_parts_property_inline_data_and_binary_data_within_string( - sentry_init, capture_events, mock_genai_client + sentry_init, + capture_events, + capture_items, + mock_genai_client, + stream_gen_ai_spans, ): sentry_init( integrations=[GoogleGenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() mock_http_response = create_mock_http_response(EXAMPLE_API_RESPONSE_JSON) @@ -1866,18 +2957,36 @@ def test_generate_content_without_parts_property_inline_data_and_binary_data_wit }, ] - with mock.patch.object( - mock_genai_client._api_client, "request", return_value=mock_http_response - ): - with start_transaction(name="google_genai"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): mock_genai_client.models.generate_content( model="gemini-1.5-flash", contents=contents, config=create_test_config() ) - (event,) = events - invoke_span = event["spans"][0] + invoke_span = next(item.payload for item in items if item.type == "span") + + messages = json.loads( + invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with mock.patch.object( + mock_genai_client._api_client, "request", return_value=mock_http_response + ), start_transaction(name="google_genai"): 
+ mock_genai_client.models.generate_content( + model="gemini-1.5-flash", contents=contents, config=create_test_config() + ) + + (event,) = events + invoke_span = event["spans"][0] + + messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) - messages = json.loads(invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(messages) == 1 assert messages[0]["role"] == "user" diff --git a/tests/integrations/huggingface_hub/test_huggingface_hub.py b/tests/integrations/huggingface_hub/test_huggingface_hub.py index 9acb6087e4..4772eb368f 100644 --- a/tests/integrations/huggingface_hub/test_huggingface_hub.py +++ b/tests/integrations/huggingface_hub/test_huggingface_hub.py @@ -466,858 +466,751 @@ def mock_hf_chat_completion_api_streaming_tools(httpx_mock): yield rsps +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @pytest.mark.parametrize("send_default_pii", [True, False]) @pytest.mark.parametrize("include_prompts", [True, False]) def test_text_generation( sentry_init: "Any", capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_text_generation_api: "Any", + stream_gen_ai_spans: "Any", ) -> None: sentry_init( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = InferenceClient(model="test-model") - with sentry_sdk.start_transaction(name="test"): - client.text_generation( - "Hello", - stream=False, - details=True, - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (transaction,) = events + with sentry_sdk.start_transaction(name="test"): + client.text_generation( + "Hello", + stream=False, + details=True, + ) - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly 
one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + assert span is not None + + assert span["attributes"]["sentry.op"] == "gen_ai.text_completion" + assert span["name"] == "text_completion test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "text_completion", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "length", + "gen_ai.response.streaming": False, + "gen_ai.usage.total_tokens": 10, + "sentry.environment": "production", + "sentry.op": "gen_ai.text_completion", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - assert span is not None + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = "Hello" + expected_data["gen_ai.response.text"] = ( + "[mocked] Hello! How can i help you?" 
+ ) + + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data + + assert span["attributes"] == expected_data + + # text generation does not set the response model + assert "gen_ai.response.model" not in span["attributes"] + else: + events = capture_events() - assert span["op"] == "gen_ai.text_completion" - assert span["description"] == "text_completion test-model" - assert span["origin"] == "auto.ai.huggingface_hub" + with sentry_sdk.start_transaction(name="test"): + client.text_generation( + "Hello", + stream=False, + details=True, + ) - expected_data = { - "gen_ai.operation.name": "text_completion", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "length", - "gen_ai.response.streaming": False, - "gen_ai.usage.total_tokens": 10, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } + (transaction,) = events + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.text_completion" + assert span["description"] == "text_completion test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "text_completion", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "length", + "gen_ai.response.streaming": False, + "gen_ai.usage.total_tokens": 10, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - if send_default_pii and include_prompts: - expected_data["gen_ai.request.messages"] = "Hello" - expected_data["gen_ai.response.text"] = "[mocked] Hello! How can i help you?" 
+ if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = "Hello" + expected_data["gen_ai.response.text"] = ( + "[mocked] Hello! How can i help you?" + ) - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in expected_data - assert "gen_ai.response.text" not in expected_data + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data - assert span["data"] == expected_data + assert span["data"] == expected_data - # text generation does not set the response model - assert "gen_ai.response.model" not in span["data"] + # text generation does not set the response model + assert "gen_ai.response.model" not in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @pytest.mark.parametrize("send_default_pii", [True, False]) @pytest.mark.parametrize("include_prompts", [True, False]) def test_text_generation_streaming( sentry_init: "Any", capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_text_generation_api_streaming: "Any", + stream_gen_ai_spans: "Any", ) -> None: sentry_init( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = InferenceClient(model="test-model") - with sentry_sdk.start_transaction(name="test"): - for _ in client.text_generation( - prompt="Hello", - stream=True, - details=True, - ): - pass - - (transaction,) = events - - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the 
hf api - assert sp["op"] == "http.client" - - assert span is not None - - assert span["op"] == "gen_ai.text_completion" - assert span["description"] == "text_completion test-model" - assert span["origin"] == "auto.ai.huggingface_hub" - - expected_data = { - "gen_ai.operation.name": "text_completion", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "length", - "gen_ai.response.streaming": True, - "gen_ai.usage.total_tokens": 10, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - if send_default_pii and include_prompts: - expected_data["gen_ai.request.messages"] = "Hello" - expected_data["gen_ai.response.text"] = "the mocked model response" - - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in expected_data - assert "gen_ai.response.text" not in expected_data + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert span["data"] == expected_data + with sentry_sdk.start_transaction(name="test"): + for _ in client.text_generation( + prompt="Hello", + stream=True, + details=True, + ): + pass + + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + assert span is not None + + assert span["attributes"]["sentry.op"] == "gen_ai.text_completion" + assert span["name"] == "text_completion test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "text_completion", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "length", + "gen_ai.response.streaming": True, + "gen_ai.usage.total_tokens": 10, + "sentry.environment": 
"production", + "sentry.op": "gen_ai.text_completion", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - # text generation does not set the response model - assert "gen_ai.response.model" not in span["data"] + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = "Hello" + expected_data["gen_ai.response.text"] = "the mocked model response" + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -@pytest.mark.parametrize("send_default_pii", [True, False]) -@pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion( - sentry_init: "Any", - capture_events: "Any", - send_default_pii: "Any", - include_prompts: "Any", - mock_hf_chat_completion_api: "Any", -) -> None: - sentry_init( - traces_sample_rate=1.0, - send_default_pii=send_default_pii, - integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - ) - events = capture_events() + assert span["attributes"] == expected_data - client = get_hf_provider_inference_client() + # text generation does not set the response model + assert "gen_ai.response.model" not in span["attributes"] + else: + events = capture_events() - with sentry_sdk.start_transaction(name="test"): - client.chat_completion( - messages=[{"role": "user", "content": "Hello!"}], - stream=False, - ) + with sentry_sdk.start_transaction(name="test"): + for _ in client.text_generation( + prompt="Hello", + stream=True, + details=True, + ): + pass + + (transaction,) = events + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one 
gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.text_completion" + assert span["description"] == "text_completion test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "text_completion", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "length", + "gen_ai.response.streaming": True, + "gen_ai.usage.total_tokens": 10, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - (transaction,) = events + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = "Hello" + expected_data["gen_ai.response.text"] = "the mocked model response" - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" - - assert span is not None - - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" - - expected_data = { - "gen_ai.operation.name": "chat", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "stop", - "gen_ai.response.model": "test-model-123", - "gen_ai.response.streaming": False, - "gen_ai.usage.input_tokens": 10, - "gen_ai.usage.output_tokens": 8, - "gen_ai.usage.total_tokens": 18, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - if send_default_pii and include_prompts: - expected_data["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "Hello!"}]' - ) - expected_data["gen_ai.response.text"] = ( - "[mocked] Hello! How can I help you today?" 
- ) + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in expected_data - assert "gen_ai.response.text" not in expected_data + assert span["data"] == expected_data - assert span["data"] == expected_data + # text generation does not set the response model + assert "gen_ai.response.model" not in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @pytest.mark.parametrize("send_default_pii", [True, False]) @pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_streaming( +def test_chat_completion( sentry_init: "Any", capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", - mock_hf_chat_completion_api_streaming: "Any", + mock_hf_chat_completion_api: "Any", + stream_gen_ai_spans: "Any", ) -> None: sentry_init( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = get_hf_provider_inference_client() - with sentry_sdk.start_transaction(name="test"): - _ = list( - client.chat_completion( - [{"role": "user", "content": "Hello!"}], - stream=True, - ) - ) - - (transaction,) = events - - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" - - assert span is not None - - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == 
"auto.ai.huggingface_hub" - - expected_data = { - "gen_ai.operation.name": "chat", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "stop", - "gen_ai.response.model": "test-model-123", - "gen_ai.response.streaming": True, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - # usage is not available in older versions of the library - if HF_VERSION and HF_VERSION >= (0, 26, 0): - expected_data["gen_ai.usage.input_tokens"] = 183 - expected_data["gen_ai.usage.output_tokens"] = 14 - expected_data["gen_ai.usage.total_tokens"] = 197 - - if send_default_pii and include_prompts: - expected_data["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "Hello!"}]' - ) - expected_data["gen_ai.response.text"] = "the mocked model response" - - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in expected_data - assert "gen_ai.response.text" not in expected_data - - assert span["data"] == expected_data + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -def test_chat_completion_api_error( - sentry_init: "Any", capture_events: "Any", mock_hf_api_with_errors: "Any" -) -> None: - sentry_init(traces_sample_rate=1.0) - events = capture_events() - - client = get_hf_provider_inference_client() - - with sentry_sdk.start_transaction(name="test"): - with pytest.raises(HfHubHTTPError): + with sentry_sdk.start_transaction(name="test"): client.chat_completion( messages=[{"role": "user", "content": "Hello!"}], + stream=False, ) - ( - error, - transaction, - ) = events - - assert error["exception"]["values"][0]["mechanism"]["type"] == "huggingface_hub" - assert not error["exception"]["values"][0]["mechanism"]["handled"] - - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai 
span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" - - assert span is not None - - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" - assert span["status"] == "internal_error" - assert span.get("tags", {}).get("status") == "internal_error" - - assert ( - error["contexts"]["trace"]["trace_id"] - == transaction["contexts"]["trace"]["trace_id"] - ) - expected_data = { - "gen_ai.operation.name": "chat", - "gen_ai.request.model": "test-model", - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - assert span["data"] == expected_data + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + assert span is not None + + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "stop", + "gen_ai.response.model": "test-model-123", + "gen_ai.response.streaming": False, + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 8, + "gen_ai.usage.total_tokens": 18, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + if send_default_pii and 
include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "Hello!"}]' + ) + expected_data["gen_ai.response.text"] = ( + "[mocked] Hello! How can I help you today?" + ) -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -def test_span_status_error( - sentry_init: "Any", capture_events: "Any", mock_hf_api_with_errors: "Any" -) -> None: - sentry_init(traces_sample_rate=1.0) - events = capture_events() + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data - client = get_hf_provider_inference_client() + assert span["attributes"] == expected_data + else: + events = capture_events() - with sentry_sdk.start_transaction(name="test"): - with pytest.raises(HfHubHTTPError): + with sentry_sdk.start_transaction(name="test"): client.chat_completion( messages=[{"role": "user", "content": "Hello!"}], + stream=False, ) - (error, transaction) = events - assert error["level"] == "error" - - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" - - assert span is not None - assert span["status"] == "internal_error" - assert span["tags"]["status"] == "internal_error" - - -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -@pytest.mark.parametrize("send_default_pii", [True, False]) -@pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_with_tools( - sentry_init: "Any", - capture_events: "Any", - send_default_pii: "Any", - include_prompts: "Any", - mock_hf_chat_completion_api_tools: "Any", -) -> None: - sentry_init( - traces_sample_rate=1.0, - send_default_pii=send_default_pii, - 
integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - ) - events = capture_events() - - client = get_hf_provider_inference_client() - - tools = [ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather", - "parameters": { - "type": "object", - "properties": {"location": {"type": "string"}}, - "required": ["location"], - }, - }, + (transaction,) = events + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.chat" + assert span["description"] == "chat test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "stop", + "gen_ai.response.model": "test-model-123", + "gen_ai.response.streaming": False, + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 8, + "gen_ai.usage.total_tokens": 18, + "thread.id": mock.ANY, + "thread.name": mock.ANY, } - ] - with sentry_sdk.start_transaction(name="test"): - client.chat_completion( - messages=[{"role": "user", "content": "What is the weather in Paris?"}], - tools=tools, - tool_choice="auto", - ) - - (transaction,) = events - - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" - - assert span is not None - - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == 
"auto.ai.huggingface_hub" - - expected_data = { - "gen_ai.operation.name": "chat", - "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "tool_calls", - "gen_ai.response.model": "test-model-123", - "gen_ai.usage.input_tokens": 10, - "gen_ai.usage.output_tokens": 8, - "gen_ai.usage.total_tokens": 18, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - if send_default_pii and include_prompts: - expected_data["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "What is the weather in Paris?"}]' - ) - expected_data["gen_ai.response.tool_calls"] = ( - '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather", "description": "None"}, "id": "call_123", "type": "function"}]' - ) + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "Hello!"}]' + ) + expected_data["gen_ai.response.text"] = ( + "[mocked] Hello! How can I help you today?" 
+ ) - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in expected_data - assert "gen_ai.response.text" not in expected_data - assert "gen_ai.response.tool_calls" not in expected_data + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data - assert span["data"] == expected_data + assert span["data"] == expected_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @pytest.mark.parametrize("send_default_pii", [True, False]) @pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_streaming_with_tools( +def test_chat_completion_streaming( sentry_init: "Any", capture_events: "Any", + capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", - mock_hf_chat_completion_api_streaming_tools: "Any", + mock_hf_chat_completion_api_streaming: "Any", + stream_gen_ai_spans: "Any", ) -> None: sentry_init( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = get_hf_provider_inference_client() - tools = [ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather", - "parameters": { - "type": "object", - "properties": {"location": {"type": "string"}}, - "required": ["location"], - }, - }, - } - ] + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with sentry_sdk.start_transaction(name="test"): - _ = list( - client.chat_completion( - messages=[{"role": "user", "content": "What is the weather in Paris?"}], - stream=True, - tools=tools, - tool_choice="auto", + with sentry_sdk.start_transaction(name="test"): + _ = list( + client.chat_completion( + [{"role": "user", "content": "Hello!"}], + 
stream=True, + ) ) - ) - - (transaction,) = events - - span = None - for sp in transaction["spans"]: - if sp["op"].startswith("gen_ai"): - assert span is None, "there is exactly one gen_ai span" - span = sp - else: - # there should be no other spans, just the gen_ai span - # and optionally some http.client spans from talking to the hf api - assert sp["op"] == "http.client" - - assert span is not None - - assert span["op"] == "gen_ai.chat" - assert span["description"] == "chat test-model" - assert span["origin"] == "auto.ai.huggingface_hub" - - expected_data = { - "gen_ai.operation.name": "chat", - "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "tool_calls", - "gen_ai.response.model": "test-model-123", - "gen_ai.response.streaming": True, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - if HF_VERSION and HF_VERSION >= (0, 26, 0): - expected_data["gen_ai.usage.input_tokens"] = 183 - expected_data["gen_ai.usage.output_tokens"] = 14 - expected_data["gen_ai.usage.total_tokens"] = 197 - - if send_default_pii and include_prompts: - expected_data["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "What is the weather in Paris?"}]' - ) - expected_data["gen_ai.response.text"] = "response with tool calls follows" - expected_data["gen_ai.response.tool_calls"] = ( - '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather"}, "id": "call_123", "type": "function", "index": "None"}]' - ) - - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in expected_data - assert "gen_ai.response.text" not in expected_data - assert "gen_ai.response.tool_calls" not in expected_data - - assert span["data"] == expected_data + spans = [item.payload for item 
in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + assert span is not None + + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "stop", + "gen_ai.response.model": "test-model-123", + "gen_ai.response.streaming": True, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + # usage is not available in older versions of the library + if HF_VERSION and HF_VERSION >= (0, 26, 0): + expected_data["gen_ai.usage.input_tokens"] = 183 + expected_data["gen_ai.usage.output_tokens"] = 14 + expected_data["gen_ai.usage.total_tokens"] = 197 + + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "Hello!"}]' + ) + expected_data["gen_ai.response.text"] = "the mocked model response" -def _get_gen_ai_span(items): - return next( - item.payload - for item in items - if item.payload.get("attributes", {}).get("sentry.op", "").startswith("gen_ai") - ) - - -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -@pytest.mark.parametrize("send_default_pii", [True, False]) -@pytest.mark.parametrize("include_prompts", [True, False]) -def 
test_text_generation_span_streaming( - sentry_init: "Any", - capture_items: "Any", - send_default_pii: "Any", - include_prompts: "Any", - mock_hf_text_generation_api: "Any", -) -> None: - sentry_init( - traces_sample_rate=1.0, - send_default_pii=send_default_pii, - integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - _experiments={"trace_lifecycle": "stream"}, - ) - items = capture_items("span") - - client = InferenceClient(model="test-model") - - client.text_generation( - "Hello", - stream=False, - details=True, - ) - - sentry_sdk.flush() - - span = _get_gen_ai_span(items) - - assert span["name"] == "text_completion test-model" - assert span["attributes"]["sentry.op"] == "gen_ai.text_completion" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" - - expected_attributes = { - "gen_ai.operation.name": "text_completion", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "length", - "gen_ai.response.streaming": False, - "gen_ai.usage.total_tokens": 10, - } + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data - if send_default_pii and include_prompts: - expected_attributes["gen_ai.request.messages"] = "Hello" - expected_attributes["gen_ai.response.text"] = ( - "[mocked] Hello! How can i help you?" 
- ) + assert span["attributes"] == expected_data + else: + events = capture_events() + + with sentry_sdk.start_transaction(name="test"): + _ = list( + client.chat_completion( + [{"role": "user", "content": "Hello!"}], + stream=True, + ) + ) - for key, value in expected_attributes.items(): - assert span["attributes"][key] == value, f"Mismatch on {key}" + (transaction,) = events + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.chat" + assert span["description"] == "chat test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "stop", + "gen_ai.response.model": "test-model-123", + "gen_ai.response.streaming": True, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + # usage is not available in older versions of the library + if HF_VERSION and HF_VERSION >= (0, 26, 0): + expected_data["gen_ai.usage.input_tokens"] = 183 + expected_data["gen_ai.usage.output_tokens"] = 14 + expected_data["gen_ai.usage.total_tokens"] = 197 + + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "Hello!"}]' + ) + expected_data["gen_ai.response.text"] = "the mocked model response" - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in span["attributes"] - assert "gen_ai.response.text" not in span["attributes"] + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data - # text generation does not set the response 
model - assert "gen_ai.response.model" not in span["attributes"] + assert span["data"] == expected_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -@pytest.mark.parametrize("send_default_pii", [True, False]) -@pytest.mark.parametrize("include_prompts", [True, False]) -def test_text_generation_streaming_span_streaming( +def test_chat_completion_api_error( sentry_init: "Any", + capture_events: "Any", capture_items: "Any", - send_default_pii: "Any", - include_prompts: "Any", - mock_hf_text_generation_api_streaming: "Any", + mock_hf_api_with_errors: "Any", + stream_gen_ai_spans: "Any", ) -> None: sentry_init( traces_sample_rate=1.0, - send_default_pii=send_default_pii, - integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - _experiments={"trace_lifecycle": "stream"}, + stream_gen_ai_spans=stream_gen_ai_spans, ) - items = capture_items("span") - - client = InferenceClient(model="test-model") - - for _ in client.text_generation( - prompt="Hello", - stream=True, - details=True, - ): - pass - - sentry_sdk.flush() - - span = _get_gen_ai_span(items) - - assert span["name"] == "text_completion test-model" - assert span["attributes"]["sentry.op"] == "gen_ai.text_completion" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" - - expected_attributes = { - "gen_ai.operation.name": "text_completion", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "length", - "gen_ai.response.streaming": True, - "gen_ai.usage.total_tokens": 10, - } - - if send_default_pii and include_prompts: - expected_attributes["gen_ai.request.messages"] = "Hello" - expected_attributes["gen_ai.response.text"] = "the mocked model response" - - for key, value in expected_attributes.items(): - assert span["attributes"][key] == value, f"Mismatch on {key}" - - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in 
span["attributes"] - assert "gen_ai.response.text" not in span["attributes"] - - assert "gen_ai.response.model" not in span["attributes"] + client = get_hf_provider_inference_client() + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -@pytest.mark.parametrize("send_default_pii", [True, False]) -@pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_span_streaming( - sentry_init: "Any", - capture_items: "Any", - send_default_pii: "Any", - include_prompts: "Any", - mock_hf_chat_completion_api: "Any", -) -> None: - sentry_init( - traces_sample_rate=1.0, - send_default_pii=send_default_pii, - integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - _experiments={"trace_lifecycle": "stream"}, - ) - items = capture_items("span") + with sentry_sdk.start_transaction(name="test"), pytest.raises(HfHubHTTPError): + client.chat_completion( + messages=[{"role": "user", "content": "Hello!"}], + ) - client = get_hf_provider_inference_client() + (error,) = (item.payload for item in items if item.type == "event") - client.chat_completion( - messages=[{"role": "user", "content": "Hello!"}], - stream=False, - ) + assert error["exception"]["values"][0]["mechanism"]["type"] == "huggingface_hub" + assert not error["exception"]["values"][0]["mechanism"]["handled"] - sentry_sdk.flush() + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" - span = _get_gen_ai_span(items) + assert span is not None - assert span["name"] == "chat test-model" - assert span["attributes"]["sentry.op"] == 
"gen_ai.chat" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + assert span["status"] == "error" - expected_attributes = { - "gen_ai.operation.name": "chat", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "stop", - "gen_ai.response.model": "test-model-123", - "gen_ai.response.streaming": False, - "gen_ai.usage.input_tokens": 10, - "gen_ai.usage.output_tokens": 8, - "gen_ai.usage.total_tokens": 18, - } + (transaction,) = (item.payload for item in items if item.type == "transaction") - if send_default_pii and include_prompts: - expected_attributes["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "Hello!"}]' - ) - expected_attributes["gen_ai.response.text"] = ( - "[mocked] Hello! How can I help you today?" + assert ( + error["contexts"]["trace"]["trace_id"] + == transaction["contexts"]["trace"]["trace_id"] ) + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "test-model", + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + assert span["attributes"] == expected_data + else: + events = capture_events() - for key, value in expected_attributes.items(): - assert span["attributes"][key] == value, f"Mismatch on {key}" + with sentry_sdk.start_transaction(name="test"), pytest.raises(HfHubHTTPError): + client.chat_completion( + messages=[{"role": "user", "content": "Hello!"}], + ) - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in span["attributes"] - assert "gen_ai.response.text" not 
in span["attributes"] + ( + error, + transaction, + ) = events + + assert error["exception"]["values"][0]["mechanism"]["type"] == "huggingface_hub" + assert not error["exception"]["values"][0]["mechanism"]["handled"] + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.chat" + assert span["description"] == "chat test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + assert span["status"] == "internal_error" + assert span.get("tags", {}).get("status") == "internal_error" + + assert ( + error["contexts"]["trace"]["trace_id"] + == transaction["contexts"]["trace"]["trace_id"] + ) + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "test-model", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + assert span["data"] == expected_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -@pytest.mark.parametrize("send_default_pii", [True, False]) -@pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_streaming_span_streaming( +def test_span_status_error( sentry_init: "Any", + capture_events: "Any", capture_items: "Any", - send_default_pii: "Any", - include_prompts: "Any", - mock_hf_chat_completion_api_streaming: "Any", + mock_hf_api_with_errors: "Any", + stream_gen_ai_spans: "Any", ) -> None: - sentry_init( - traces_sample_rate=1.0, - send_default_pii=send_default_pii, - integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - _experiments={"trace_lifecycle": "stream"}, - ) - items = capture_items("span") - client = get_hf_provider_inference_client() - _ = list( - 
client.chat_completion( - [{"role": "user", "content": "Hello!"}], - stream=True, - ) - ) - - sentry_sdk.flush() - - span = _get_gen_ai_span(items) - - assert span["name"] == "chat test-model" - assert span["attributes"]["sentry.op"] == "gen_ai.chat" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" - - expected_attributes = { - "gen_ai.operation.name": "chat", - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "stop", - "gen_ai.response.model": "test-model-123", - "gen_ai.response.streaming": True, - } - if HF_VERSION and HF_VERSION >= (0, 26, 0): - expected_attributes["gen_ai.usage.input_tokens"] = 183 - expected_attributes["gen_ai.usage.output_tokens"] = 14 - expected_attributes["gen_ai.usage.total_tokens"] = 197 - - if send_default_pii and include_prompts: - expected_attributes["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "Hello!"}]' - ) - expected_attributes["gen_ai.response.text"] = "the mocked model response" - - for key, value in expected_attributes.items(): - assert span["attributes"][key] == value, f"Mismatch on {key}" - - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in span["attributes"] - assert "gen_ai.response.text" not in span["attributes"] - - -@pytest.mark.httpx_mock(assert_all_requests_were_expected=False) -def test_chat_completion_api_error_span_streaming( - sentry_init: "Any", capture_items: "Any", mock_hf_api_with_errors: "Any" -) -> None: sentry_init( traces_sample_rate=1.0, - _experiments={"trace_lifecycle": "stream"}, + stream_gen_ai_spans=stream_gen_ai_spans, ) - items = capture_items("event", "span") - - client = get_hf_provider_inference_client() - with pytest.raises(HfHubHTTPError): - client.chat_completion( - messages=[{"role": "user", "content": "Hello!"}], - ) + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") - sentry_sdk.flush() + with sentry_sdk.start_transaction(name="test"), 
pytest.raises(HfHubHTTPError): + client.chat_completion( + messages=[{"role": "user", "content": "Hello!"}], + ) - error_event = next(item.payload for item in items if item.type == "event") - span = _get_gen_ai_span(items) + (error,) = [item.payload for item in items if item.type == "event"] + assert error["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + assert span is not None + assert span["status"] == "error" + else: + events = capture_events() - assert error_event["exception"]["values"][0]["mechanism"]["type"] == ( - "huggingface_hub" - ) - assert not error_event["exception"]["values"][0]["mechanism"]["handled"] - assert error_event["level"] == "error" + with sentry_sdk.start_transaction(name="test"), pytest.raises(HfHubHTTPError): + client.chat_completion( + messages=[{"role": "user", "content": "Hello!"}], + ) - assert span["name"] == "chat test-model" - assert span["attributes"]["sentry.op"] == "gen_ai.chat" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" - assert span["status"] == "error" + (error, transaction) = events + assert error["level"] == "error" - assert error_event["contexts"]["trace"]["trace_id"] == span["trace_id"] + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" - assert span["attributes"]["gen_ai.operation.name"] == "chat" - assert 
span["attributes"]["gen_ai.request.model"] == "test-model" + assert span is not None + assert span["status"] == "internal_error" + assert span["tags"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @pytest.mark.parametrize("send_default_pii", [True, False]) @pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_with_tools_span_streaming( +def test_chat_completion_with_tools( sentry_init: "Any", + capture_events: "Any", capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_chat_completion_api_tools: "Any", -) -> None: + stream_gen_ai_spans: "Any", +): sentry_init( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - _experiments={"trace_lifecycle": "stream"}, + stream_gen_ai_spans=stream_gen_ai_spans, ) - items = capture_items("span") client = get_hf_provider_inference_client() @@ -1336,65 +1229,144 @@ def test_chat_completion_with_tools_span_streaming( } ] - client.chat_completion( - messages=[{"role": "user", "content": "What is the weather in Paris?"}], - tools=tools, - tool_choice="auto", - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - sentry_sdk.flush() + with sentry_sdk.start_transaction(name="test"): + client.chat_completion( + messages=[{"role": "user", "content": "What is the weather in Paris?"}], + tools=tools, + tool_choice="auto", + ) - span = _get_gen_ai_span(items) + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + 
assert span is not None + + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "tool_calls", + "gen_ai.response.model": "test-model-123", + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 8, + "gen_ai.usage.total_tokens": 18, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - assert span["name"] == "chat test-model" - assert span["attributes"]["sentry.op"] == "gen_ai.chat" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "What is the weather in Paris?"}]' + ) + expected_data["gen_ai.response.tool_calls"] = ( + '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather", "description": "None"}, "id": "call_123", "type": "function"}]' + ) - expected_attributes = { - "gen_ai.operation.name": "chat", - "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": 
"tool_calls", - "gen_ai.response.model": "test-model-123", - "gen_ai.usage.input_tokens": 10, - "gen_ai.usage.output_tokens": 8, - "gen_ai.usage.total_tokens": 18, - } + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data + assert "gen_ai.response.tool_calls" not in expected_data - if send_default_pii and include_prompts: - expected_attributes["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "What is the weather in Paris?"}]' - ) - expected_attributes["gen_ai.response.tool_calls"] = ( - '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather", "description": "None"}, "id": "call_123", "type": "function"}]' - ) + assert span["attributes"] == expected_data + else: + events = capture_events() + + with sentry_sdk.start_transaction(name="test"): + client.chat_completion( + messages=[{"role": "user", "content": "What is the weather in Paris?"}], + tools=tools, + tool_choice="auto", + ) - for key, value in expected_attributes.items(): - assert span["attributes"][key] == value, f"Mismatch on {key}" + (transaction,) = events + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.chat" + assert span["description"] == "chat test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', + "gen_ai.request.model": 
"test-model", + "gen_ai.response.finish_reasons": "tool_calls", + "gen_ai.response.model": "test-model-123", + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 8, + "gen_ai.usage.total_tokens": 18, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in span["attributes"] - assert "gen_ai.response.text" not in span["attributes"] - assert "gen_ai.response.tool_calls" not in span["attributes"] + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "What is the weather in Paris?"}]' + ) + expected_data["gen_ai.response.tool_calls"] = ( + '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather", "description": "None"}, "id": "call_123", "type": "function"}]' + ) + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data + assert "gen_ai.response.tool_calls" not in expected_data + assert span["data"] == expected_data + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.httpx_mock(assert_all_requests_were_expected=False) @pytest.mark.parametrize("send_default_pii", [True, False]) @pytest.mark.parametrize("include_prompts", [True, False]) -def test_chat_completion_streaming_with_tools_span_streaming( +def test_chat_completion_streaming_with_tools( sentry_init: "Any", + capture_events: "Any", capture_items: "Any", send_default_pii: "Any", include_prompts: "Any", mock_hf_chat_completion_api_streaming_tools: "Any", + stream_gen_ai_spans: "Any", ) -> None: sentry_init( traces_sample_rate=1.0, send_default_pii=send_default_pii, integrations=[HuggingfaceHubIntegration(include_prompts=include_prompts)], - _experiments={"trace_lifecycle": "stream"}, + stream_gen_ai_spans=stream_gen_ai_spans, ) - items = capture_items("span") client = 
get_hf_provider_inference_client() @@ -1413,50 +1385,138 @@ def test_chat_completion_streaming_with_tools_span_streaming( } ] - _ = list( - client.chat_completion( - messages=[{"role": "user", "content": "What is the weather in Paris?"}], - stream=True, - tools=tools, - tool_choice="auto", - ) - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with sentry_sdk.start_transaction(name="test"): + _ = list( + client.chat_completion( + messages=[ + {"role": "user", "content": "What is the weather in Paris?"} + ], + stream=True, + tools=tools, + tool_choice="auto", + ) + ) - sentry_sdk.flush() + spans = [item.payload for item in items if item.type == "span"] + span = None + for sp in spans: + if sp["attributes"]["sentry.op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["attributes"]["sentry.op"] == "http.client" + + assert span is not None + + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["name"] == "chat test-model" + assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "tool_calls", + "gen_ai.response.model": "test-model-123", + "gen_ai.response.streaming": True, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "auto.ai.huggingface_hub", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": 
"test", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } - span = _get_gen_ai_span(items) + if HF_VERSION and HF_VERSION >= (0, 26, 0): + expected_data["gen_ai.usage.input_tokens"] = 183 + expected_data["gen_ai.usage.output_tokens"] = 14 + expected_data["gen_ai.usage.total_tokens"] = 197 - assert span["name"] == "chat test-model" - assert span["attributes"]["sentry.op"] == "gen_ai.chat" - assert span["attributes"]["sentry.origin"] == "auto.ai.huggingface_hub" + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "What is the weather in Paris?"}]' + ) + expected_data["gen_ai.response.text"] = "response with tool calls follows" + expected_data["gen_ai.response.tool_calls"] = ( + '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather"}, "id": "call_123", "type": "function", "index": "None"}]' + ) - expected_attributes = { - "gen_ai.operation.name": "chat", - "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', - "gen_ai.request.model": "test-model", - "gen_ai.response.finish_reasons": "tool_calls", - "gen_ai.response.model": "test-model-123", - "gen_ai.response.streaming": True, - } + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data + assert "gen_ai.response.tool_calls" not in expected_data - if HF_VERSION and HF_VERSION >= (0, 26, 0): - expected_attributes["gen_ai.usage.input_tokens"] = 183 - expected_attributes["gen_ai.usage.output_tokens"] = 14 - expected_attributes["gen_ai.usage.total_tokens"] = 197 + assert span["attributes"] == expected_data + else: + events = capture_events() + + with sentry_sdk.start_transaction(name="test"): + _ = list( + client.chat_completion( + messages=[ 
+ {"role": "user", "content": "What is the weather in Paris?"} + ], + stream=True, + tools=tools, + tool_choice="auto", + ) + ) - if send_default_pii and include_prompts: - expected_attributes["gen_ai.request.messages"] = ( - '[{"role": "user", "content": "What is the weather in Paris?"}]' - ) - expected_attributes["gen_ai.response.text"] = "response with tool calls follows" - expected_attributes["gen_ai.response.tool_calls"] = ( - '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather"}, "id": "call_123", "type": "function", "index": "None"}]' - ) + (transaction,) = events + + span = None + for sp in transaction["spans"]: + if sp["op"].startswith("gen_ai"): + assert span is None, "there is exactly one gen_ai span" + span = sp + else: + # there should be no other spans, just the gen_ai span + # and optionally some http.client spans from talking to the hf api + assert sp["op"] == "http.client" + + assert span is not None + + assert span["op"] == "gen_ai.chat" + assert span["description"] == "chat test-model" + assert span["origin"] == "auto.ai.huggingface_hub" + + expected_data = { + "gen_ai.operation.name": "chat", + "gen_ai.request.available_tools": '[{"type": "function", "function": {"name": "get_weather", "description": "Get current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string"}}, "required": ["location"]}}}]', + "gen_ai.request.model": "test-model", + "gen_ai.response.finish_reasons": "tool_calls", + "gen_ai.response.model": "test-model-123", + "gen_ai.response.streaming": True, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + if HF_VERSION and HF_VERSION >= (0, 26, 0): + expected_data["gen_ai.usage.input_tokens"] = 183 + expected_data["gen_ai.usage.output_tokens"] = 14 + expected_data["gen_ai.usage.total_tokens"] = 197 + + if send_default_pii and include_prompts: + expected_data["gen_ai.request.messages"] = ( + '[{"role": "user", "content": "What is the weather in Paris?"}]' + ) + 
expected_data["gen_ai.response.text"] = "response with tool calls follows" + expected_data["gen_ai.response.tool_calls"] = ( + '[{"function": {"arguments": {"location": "Paris"}, "name": "get_weather"}, "id": "call_123", "type": "function", "index": "None"}]' + ) - for key, value in expected_attributes.items(): - assert span["attributes"][key] == value, f"Mismatch on {key}" + if not send_default_pii or not include_prompts: + assert "gen_ai.request.messages" not in expected_data + assert "gen_ai.response.text" not in expected_data + assert "gen_ai.response.tool_calls" not in expected_data - if not send_default_pii or not include_prompts: - assert "gen_ai.request.messages" not in span["attributes"] - assert "gen_ai.response.text" not in span["attributes"] - assert "gen_ai.response.tool_calls" not in span["attributes"] + assert span["data"] == expected_data diff --git a/tests/integrations/langchain/test_langchain.py b/tests/integrations/langchain/test_langchain.py index 336be2fb1e..79ecc7e96b 100644 --- a/tests/integrations/langchain/test_langchain.py +++ b/tests/integrations/langchain/test_langchain.py @@ -237,10 +237,13 @@ def get_word_length(word: str) -> int: return len(word) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_langchain_text_completion( sentry_init, capture_events, + capture_items, get_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -250,8 +253,8 @@ def test_langchain_text_completion( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() model_response = get_model_response( Completion( @@ -282,41 +285,83 @@ def test_langchain_text_completion( openai_api_key="badkey", ) - with patch.object( - model.client._client._client, - "send", - return_value=model_response, - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + model.client._client._client, + "send", 
+ return_value=model_response, + ) as _, start_transaction(): input_text = "What is the capital of France?" model.invoke(input_text, config={"run_name": "my-snazzy-pipeline"}) - tx = events[0] - assert tx["type"] == "transaction" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" - llm_spans = [ - span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" - ] - assert len(llm_spans) > 0 + spans = [item.payload for item in items if item.type == "span"] + llm_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" + ] + assert len(llm_spans) > 0 + + llm_span = llm_spans[0] + assert llm_span["name"] == "text_completion gpt-3.5-turbo" + assert llm_span["attributes"]["gen_ai.system"] == "openai" + assert llm_span["attributes"]["gen_ai.function_id"] == "my-snazzy-pipeline" + assert llm_span["attributes"]["gen_ai.request.model"] == "gpt-3.5-turbo" + assert ( + llm_span["attributes"]["gen_ai.response.text"] + == "The capital of France is Paris." + ) + assert llm_span["attributes"]["gen_ai.usage.total_tokens"] == 25 + assert llm_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert llm_span["attributes"]["gen_ai.usage.output_tokens"] == 15 + else: + events = capture_events() + + with patch.object( + model.client._client._client, + "send", + return_value=model_response, + ) as _, start_transaction(): + input_text = "What is the capital of France?" 
+ model.invoke(input_text, config={"run_name": "my-snazzy-pipeline"}) + + tx = events[0] + assert tx["type"] == "transaction" + + llm_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.text_completion" + ] + assert len(llm_spans) > 0 - llm_span = llm_spans[0] - assert llm_span["description"] == "text_completion gpt-3.5-turbo" - assert llm_span["data"]["gen_ai.system"] == "openai" - assert llm_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" - assert llm_span["data"]["gen_ai.request.model"] == "gpt-3.5-turbo" - assert llm_span["data"]["gen_ai.response.text"] == "The capital of France is Paris." - assert llm_span["data"]["gen_ai.usage.total_tokens"] == 25 - assert llm_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert llm_span["data"]["gen_ai.usage.output_tokens"] == 15 + llm_span = llm_spans[0] + assert llm_span["description"] == "text_completion gpt-3.5-turbo" + assert llm_span["data"]["gen_ai.system"] == "openai" + assert llm_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + assert llm_span["data"]["gen_ai.request.model"] == "gpt-3.5-turbo" + assert ( + llm_span["data"]["gen_ai.response.text"] + == "The capital of France is Paris." 
+ ) + assert llm_span["data"]["gen_ai.usage.total_tokens"] == 25 + assert llm_span["data"]["gen_ai.usage.input_tokens"] == 10 + assert llm_span["data"]["gen_ai.usage.output_tokens"] == 15 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_langchain_chat_with_run_name( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -326,8 +371,8 @@ def test_langchain_chat_with_run_name( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() request_headers = {} # Changed in https://github.com/langchain-ai/langchain/pull/32655 @@ -356,27 +401,56 @@ def test_langchain_chat_with_run_name( openai_api_key="badkey", ) - with patch.object( - llm.client._client._client, - "send", - return_value=model_response, - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("span") + + with patch.object( + llm.client._client._client, + "send", + return_value=model_response, + ) as _, start_transaction(): + llm.invoke( + "How many letters in the word eudca", + config={"run_name": "my-snazzy-pipeline"}, + ) + + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + assert len(chat_spans) == 1 + assert ( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_FUNCTION_ID] + == "my-snazzy-pipeline" + ) + else: + events = capture_events() + + with patch.object( + llm.client._client._client, + "send", + return_value=model_response, + ) as _, start_transaction(): llm.invoke( "How many letters in the word eudca", config={"run_name": "my-snazzy-pipeline"}, ) - tx = events[0] + tx = events[0] - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - assert len(chat_spans) == 1 - assert chat_spans[0]["data"][SPANDATA.GEN_AI_FUNCTION_ID] == "my-snazzy-pipeline" + 
chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + assert len(chat_spans) == 1 + assert ( + chat_spans[0]["data"][SPANDATA.GEN_AI_FUNCTION_ID] == "my-snazzy-pipeline" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_langchain_tool_call_with_run_name( sentry_init, capture_events, + capture_items, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -386,21 +460,44 @@ def test_langchain_tool_call_with_run_name( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(): + get_word_length.invoke( + {"word": "eudca"}, + config={"run_name": "my-snazzy-pipeline"}, + ) - with start_transaction(): - get_word_length.invoke( - {"word": "eudca"}, - config={"run_name": "my-snazzy-pipeline"}, + spans = [item.payload for item in items if item.type == "span"] + tool_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) + assert len(tool_spans) == 1 + assert ( + tool_spans[0]["attributes"][SPANDATA.GEN_AI_FUNCTION_ID] + == "my-snazzy-pipeline" ) + else: + events = capture_events() + + with start_transaction(): + get_word_length.invoke( + {"word": "eudca"}, + config={"run_name": "my-snazzy-pipeline"}, + ) - tx = events[0] - tool_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") - assert len(tool_spans) == 1 - assert tool_spans[0]["data"][SPANDATA.GEN_AI_FUNCTION_ID] == "my-snazzy-pipeline" + tx = events[0] + tool_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + assert len(tool_spans) == 1 + assert ( + tool_spans[0]["data"][SPANDATA.GEN_AI_FUNCTION_ID] == "my-snazzy-pipeline" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( LANGCHAIN_VERSION < (1,), reason="LangChain 1.0+ required (ONE AGENT refactor)", @@ -428,12 +525,14 @@ def 
test_langchain_tool_call_with_run_name( def test_langchain_create_agent( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, system_instructions_content, request, get_model_response, nonstreaming_responses_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -443,8 +542,8 @@ def test_langchain_create_agent( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() model_response = get_model_response( nonstreaming_responses_model_response, @@ -467,12 +566,14 @@ def test_langchain_create_agent( name="word_length_agent", ) - with patch.object( - llm.client._client._client, - "send", - return_value=model_response, - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + return_value=model_response, + ) as _, start_transaction(): agent.invoke( { "messages": [ @@ -481,52 +582,133 @@ def test_langchain_create_agent( }, ) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - assert len(chat_spans) == 1 - assert chat_spans[0]["origin"] == "auto.ai.langchain" + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + assert len(chat_spans) == 1 + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert chat_spans[0]["data"]["gen_ai.system"] == "openai-chat" - assert chat_spans[0]["data"]["gen_ai.agent.name"] == "word_length_agent" + assert chat_spans[0]["attributes"]["gen_ai.system"] == "openai-chat" + assert 
chat_spans[0]["attributes"]["gen_ai.agent.name"] == "word_length_agent" - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 10 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 20 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 30 + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 30 - if send_default_pii and include_prompts: - assert ( - chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - == "Hello, how can I help you?" - ) + if send_default_pii and include_prompts: + assert ( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == "Hello, how can I help you?" + ) - param_id = request.node.callspec.id - if "string" in param_id: - assert [ - { - "type": "text", - "content": "You are very powerful assistant, but don't know current events", - } - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + param_id = request.node.callspec.id + if "string" in param_id: + assert [ + { + "type": "text", + "content": "You are very powerful assistant, but don't know current events", + } + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + else: + assert [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) else: - assert [ - { - "type": "text", - "content": "You are a helpful assistant.", - }, + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get( + "attributes", {} + ) + + else: + events = capture_events() + + with patch.object( + 
llm.client._client._client, + "send", + return_value=model_response, + ) as _, start_transaction(): + agent.invoke( { - "type": "text", - "content": "Be concise and clear.", + "messages": [ + HumanMessage(content="How many letters in the word eudca"), + ], }, - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) - else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + assert len(chat_spans) == 1 + assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[0]["data"]["gen_ai.system"] == "openai-chat" + assert chat_spans[0]["data"]["gen_ai.agent.name"] == "word_length_agent" + assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 10 + assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 20 + assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 30 + + if send_default_pii and include_prompts: + assert ( + chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + == "Hello, how can I help you?" 
+ ) + + param_id = request.node.callspec.id + if "string" in param_id: + assert [ + { + "type": "text", + "content": "You are very powerful assistant, but don't know current events", + } + ] == json.loads( + chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + else: + assert [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] == json.loads( + chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get( + "data", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( LANGCHAIN_VERSION < (1,), reason="LangChain 1.0+ required (ONE AGENT refactor)", @@ -543,10 +725,12 @@ def test_langchain_create_agent( def test_tool_execution_span( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, responses_tool_call_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -556,8 +740,8 @@ def test_tool_execution_span( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() responses = responses_tool_call_model_responses( tool_name="get_word_length", @@ -619,12 +803,14 @@ def test_tool_execution_span( name="word_length_agent", ) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): agent.invoke( { "messages": [ @@ -633,75 +819,186 @@ def test_tool_execution_span( }, ) 
- tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - assert len(chat_spans) == 2 + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + assert len(chat_spans) == 2 - tool_exec_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") - assert len(tool_exec_spans) == 1 - tool_exec_span = tool_exec_spans[0] + tool_exec_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) + assert len(tool_exec_spans) == 1 + tool_exec_span = tool_exec_spans[0] - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert chat_spans[0]["data"]["gen_ai.agent.name"] == "word_length_agent" - assert chat_spans[1]["data"]["gen_ai.agent.name"] == "word_length_agent" - assert tool_exec_span["data"]["gen_ai.agent.name"] == "word_length_agent" + assert chat_spans[0]["attributes"]["gen_ai.agent.name"] == "word_length_agent" + assert chat_spans[1]["attributes"]["gen_ai.agent.name"] == "word_length_agent" + assert tool_exec_span["attributes"]["gen_ai.agent.name"] == "word_length_agent" - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 - assert 
chat_spans[0]["data"]["gen_ai.system"] == "openai-chat" + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 + assert chat_spans[0]["attributes"]["gen_ai.system"] == "openai-chat" - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 - assert chat_spans[1]["data"]["gen_ai.system"] == "openai-chat" + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 + assert chat_spans[1]["attributes"]["gen_ai.system"] == "openai-chat" - if send_default_pii and include_prompts: - assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] + if send_default_pii and include_prompts: + assert "word" in tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_INPUT] - assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "5" in chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] - # Verify tool calls are recorded when PII is enabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), ( - "Tool calls should be recorded when send_default_pii=True and include_prompts=True" - ) - tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] - assert isinstance(tool_calls_data, str) - assert "get_word_length" in tool_calls_data + # Verify tool calls are recorded when PII is enabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "attributes", {} + ), ( + "Tool calls should be recorded when send_default_pii=True and include_prompts=True" + ) + tool_calls_data = chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] + 
assert isinstance(tool_calls_data, str) + assert "get_word_length" in tool_calls_data + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get( + "attributes", {} + ) + + # Verify tool calls are NOT recorded when PII is disabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( + "attributes", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( + "attributes", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS + ] + assert "get_word_length" in tools_data else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) - assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + events = capture_events() - # Verify tool calls are NOT recorded when PII is disabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( 
- "data", {} - ), ( - f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " - f"and include_prompts={include_prompts}" - ) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( - "data", {} - ), ( - f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " - f"and include_prompts={include_prompts}" + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + agent.invoke( + { + "messages": [ + HumanMessage(content="How many letters in the word eudca"), + ], + }, + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + assert len(chat_spans) == 2 + tool_exec_spans = list( + x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool" ) - # Verify that available tools are always recorded regardless of PII settings - for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - assert "get_word_length" in tools_data + assert len(tool_exec_spans) == 1 + tool_exec_span = tool_exec_spans[0] + + assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[1]["origin"] == "auto.ai.langchain" + assert tool_exec_span["origin"] == "auto.ai.langchain" + + assert chat_spans[0]["data"]["gen_ai.agent.name"] == "word_length_agent" + assert chat_spans[1]["data"]["gen_ai.agent.name"] == "word_length_agent" + assert tool_exec_span["data"]["gen_ai.agent.name"] == "word_length_agent" + + assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + assert chat_spans[0]["data"]["gen_ai.system"] == "openai-chat" + + assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 + assert 
chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + assert chat_spans[1]["data"]["gen_ai.system"] == "openai-chat" + + if send_default_pii and include_prompts: + assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] + + assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + # Verify tool calls are recorded when PII is enabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "data", {} + ), ( + "Tool calls should be recorded when send_default_pii=True and include_prompts=True" + ) + tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + assert isinstance(tool_calls_data, str) + assert "get_word_length" in tool_calls_data + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + + # Verify tool calls are NOT recorded when PII is disabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( + "data", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( + "data", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert "get_word_length" in tools_data + 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -713,11 +1010,13 @@ def test_tool_execution_span( def test_langchain_openai_tools_agent_no_prompts( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, streaming_chat_completions_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -727,8 +1026,8 @@ def test_langchain_openai_tools_agent_no_prompts( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -766,12 +1065,14 @@ def test_langchain_openai_tools_agent_no_prompts( agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): list( agent_executor.invoke( {"input": "How many letters in the word eudca"}, @@ -779,72 +1080,191 @@ def test_langchain_openai_tools_agent_no_prompts( ) ) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for 
x in spans if x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + tool_exec_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) - assert len(chat_spans) == 2 + assert len(chat_spans) == 2 - assert invoke_agent_span["origin"] == "auto.ai.langchain" - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + assert ( + invoke_agent_span["attributes"]["gen_ai.function_id"] + == "my-snazzy-pipeline" + ) - # We can't guarantee anything about the "shape" of the langchain execution graph - assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + # We can't guarantee anything about the "shape" of the langchain execution graph + assert ( + len(list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat")) + > 0 + ) - # Token usage is only available in newer versions of langchain (v0.2+) - # where usage_metadata is supported on AIMessageChunk - if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["attributes"]: + assert 
chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 - if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + if "gen_ai.usage.input_tokens" in chat_spans[1]["attributes"]: + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("attributes", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("attributes", {}) - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) - 
assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + # Verify tool calls are NOT recorded when PII is disabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( + "attributes", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( + "attributes", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) - # Verify tool calls are NOT recorded when PII is disabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get("data", {}), ( - f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " - f"and include_prompts={include_prompts}" - ) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get("data", {}), ( - f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " - f"and include_prompts={include_prompts}" - ) + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "function_call" + ] + assert chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] - # Verify finish_reasons is always an array of strings - assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ - "function_call" - ] - assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["stop"] + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS + ] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data + else: + events = capture_events() + + with patch.object( + 
llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + list( + agent_executor.invoke( + {"input": "How many letters in the word eudca"}, + {"run_name": "my-snazzy-pipeline"}, + ) + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + invoke_agent_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent" + ) + chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + tool_exec_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool" + ) + + assert len(chat_spans) == 2 + + assert invoke_agent_span["origin"] == "auto.ai.langchain" + assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[1]["origin"] == "auto.ai.langchain" + assert tool_exec_span["origin"] == "auto.ai.langchain" + + assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + + # We can't guarantee anything about the "shape" of the langchain execution graph + assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: + assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + + if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: + assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in 
chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) - # Verify that available tools are always recorded regardless of PII settings - for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - assert tools_data is not None, ( - "Available tools should always be recorded regardless of PII settings" + # Verify tool calls are NOT recorded when PII is disabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( + "data", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( + "data", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" ) - assert "get_word_length" in tools_data + + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "function_call" + ] + assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] + + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "system_instructions_content", [ @@ -860,11 +1280,13 @@ def 
test_langchain_openai_tools_agent_no_prompts( def test_langchain_openai_tools_agent( sentry_init, capture_events, + capture_items, system_instructions_content, request, get_model_response, server_side_event_chunks, streaming_chat_completions_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -874,8 +1296,8 @@ def test_langchain_openai_tools_agent( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -913,104 +1335,233 @@ def test_langchain_openai_tools_agent( agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): list(agent_executor.stream({"input": "How many letters in the word eudca"})) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + tool_exec_span = next( + x for x in spans if x["attributes"]["sentry.op"] == 
"gen_ai.execute_tool" + ) - assert len(chat_spans) == 2 + assert len(chat_spans) == 2 - assert invoke_agent_span["origin"] == "auto.ai.langchain" - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" - # We can't guarantee anything about the "shape" of the langchain execution graph - assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + # We can't guarantee anything about the "shape" of the langchain execution graph + assert ( + len(list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat")) + > 0 + ) - # Token usage is only available in newer versions of langchain (v0.2+) - # where usage_metadata is supported on AIMessageChunk - if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["attributes"]: + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 - if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert 
chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + if "gen_ai.usage.input_tokens" in chat_spans[1]["attributes"]: + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 - assert "5" in chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] - assert 5 == int(tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_OUTPUT]) + assert "5" in chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "word" in tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_INPUT] + assert 5 == int(tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_OUTPUT]) - param_id = request.node.callspec.id - if "string" in param_id: - assert [ - { - "type": "text", - "content": "You are very powerful assistant, but don't know current events", - } - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + param_id = request.node.callspec.id + if "string" in param_id: + assert [ + { + "type": "text", + "content": "You are very powerful assistant, but don't know current events", + } + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + else: + assert [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + + assert "5" in chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + # Verify tool calls are recorded when PII is enabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "attributes", {} + ), ( + "Tool calls should be recorded when send_default_pii=True and include_prompts=True" + ) + tool_calls_data = chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] + + assert 
isinstance(tool_calls_data, (list, str)) # Could be serialized + if isinstance(tool_calls_data, str): + assert "get_word_length" in tool_calls_data + elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0: + # Check if tool calls contain expected function name + tool_call_str = str(tool_calls_data) + assert "get_word_length" in tool_call_str + + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS + ] == ["function_call"] + assert chat_spans[1]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS + ] == ["stop"] + + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS + ] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data else: - assert [ - { - "type": "text", - "content": "You are a helpful assistant.", - }, - { - "type": "text", - "content": "Be concise and clear.", - }, - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) - - assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - - # Verify tool calls are recorded when PII is enabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), ( - "Tool calls should be recorded when send_default_pii=True and include_prompts=True" - ) - tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] - assert isinstance(tool_calls_data, (list, str)) # Could be serialized - if isinstance(tool_calls_data, str): - assert "get_word_length" in tool_calls_data - elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0: - # Check if tool calls contain expected function name - tool_call_str = str(tool_calls_data) - assert "get_word_length" in tool_call_str - - # Verify finish_reasons is always an array of 
strings - assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ - "function_call" - ] - assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["stop"] + events = capture_events() - # Verify that available tools are always recorded regardless of PII settings - for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - assert tools_data is not None, ( - "Available tools should always be recorded regardless of PII settings" + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + list(agent_executor.stream({"input": "How many letters in the word eudca"})) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + invoke_agent_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent" + ) + chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + tool_exec_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool" ) - assert "get_word_length" in tools_data + + assert len(chat_spans) == 2 + + assert invoke_agent_span["origin"] == "auto.ai.langchain" + assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[1]["origin"] == "auto.ai.langchain" + assert tool_exec_span["origin"] == "auto.ai.langchain" + + # We can't guarantee anything about the "shape" of the langchain execution graph + assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: + assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + + if "gen_ai.usage.input_tokens" in 
chat_spans[1]["data"]: + assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + + assert "5" in chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] + assert 5 == int(tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_OUTPUT]) + + param_id = request.node.callspec.id + if "string" in param_id: + assert [ + { + "type": "text", + "content": "You are very powerful assistant, but don't know current events", + } + ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + else: + assert [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + + assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + # Verify tool calls are recorded when PII is enabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "data", {} + ), ( + "Tool calls should be recorded when send_default_pii=True and include_prompts=True" + ) + tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + + assert isinstance(tool_calls_data, (list, str)) # Could be serialized + if isinstance(tool_calls_data, str): + assert "get_word_length" in tool_calls_data + elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0: + # Check if tool calls contain expected function name + tool_call_str = str(tool_calls_data) + assert "get_word_length" in tool_call_str + + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "function_call" + ] + assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] + + # Verify that available tools are always recorded regardless of 
PII settings + for chat_span in chat_spans: + tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_langchain_openai_tools_agent_with_config( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, streaming_chat_completions_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1020,8 +1571,8 @@ def test_langchain_openai_tools_agent_with_config( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -1061,26 +1612,57 @@ def test_langchain_openai_tools_agent_with_config( agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): list( agent_executor.invoke( {"input": "How many letters in the word eudca"}, ) ) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for x in spans if 
x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + assert ( + invoke_agent_span["attributes"]["gen_ai.function_id"] + == "my-snazzy-pipeline" + ) + else: + events = capture_events() + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + list( + agent_executor.invoke( + {"input": "How many letters in the word eudca"}, + ) + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + invoke_agent_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent" + ) + assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -1092,11 +1674,13 @@ def test_langchain_openai_tools_agent_with_config( def test_langchain_openai_tools_agent_stream_no_prompts( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, streaming_chat_completions_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1106,8 +1690,8 @@ def test_langchain_openai_tools_agent_stream_no_prompts( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -1145,12 +1729,14 @@ def test_langchain_openai_tools_agent_stream_no_prompts( agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): list( 
agent_executor.stream( {"input": "How many letters in the word eudca"}, @@ -1158,72 +1744,193 @@ def test_langchain_openai_tools_agent_stream_no_prompts( ) ) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + tool_exec_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + assert len(chat_spans) == 2 - assert len(chat_spans) == 2 + assert invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert invoke_agent_span["origin"] == "auto.ai.langchain" - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert ( + invoke_agent_span["attributes"]["gen_ai.function_id"] + == "my-snazzy-pipeline" + ) - assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + spans = [item.payload for item in items if item.type == "span"] + # We can't guarantee anything about the "shape" of the langchain execution graph + assert ( + 
len(list(x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat")) + > 0 + ) - # We can't guarantee anything about the "shape" of the langchain execution graph - assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["attributes"]: + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 - # Token usage is only available in newer versions of langchain (v0.2+) - # where usage_metadata is supported on AIMessageChunk - if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + if "gen_ai.usage.input_tokens" in chat_spans[1]["attributes"]: + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 - if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("attributes", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in 
chat_spans[1].get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("attributes", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("attributes", {}) - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) - assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) - assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + # Verify tool calls are NOT recorded when PII is disabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( + "attributes", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( + "attributes", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) - # Verify tool calls are NOT recorded when PII is disabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get("data", {}), ( - f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " - f"and include_prompts={include_prompts}" - ) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get("data", {}), ( - f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " - f"and include_prompts={include_prompts}" - ) + # Verify finish_reasons is always an array of strings + assert 
chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "function_call" + ] + assert chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] - # Verify finish_reasons is always an array of strings - assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ - "function_call" - ] - assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["stop"] + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS + ] + + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data + else: + events = capture_events() - # Verify that available tools are always recorded regardless of PII settings - for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - assert tools_data is not None, ( - "Available tools should always be recorded regardless of PII settings" + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + list( + agent_executor.stream( + {"input": "How many letters in the word eudca"}, + {"run_name": "my-snazzy-pipeline"}, + ) + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + invoke_agent_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent" + ) + chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + tool_exec_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool" ) - assert "get_word_length" in tools_data + assert len(chat_spans) == 2 + assert invoke_agent_span["origin"] == "auto.ai.langchain" + assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[1]["origin"] == "auto.ai.langchain" + assert 
tool_exec_span["origin"] == "auto.ai.langchain" + + assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + + # We can't guarantee anything about the "shape" of the langchain execution graph + assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: + assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + + if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: + assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[0].get("data", {}) + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in chat_spans[1].get("data", {}) + assert SPANDATA.GEN_AI_TOOL_INPUT not in tool_exec_span.get("data", {}) + assert SPANDATA.GEN_AI_TOOL_OUTPUT not in tool_exec_span.get("data", {}) + + # Verify tool calls are NOT recorded when PII is disabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[0].get( + "data", {} + ), ( + f"Tool calls should NOT be recorded when send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in chat_spans[1].get( + "data", {} + ), ( + f"Tool calls should NOT be recorded when 
send_default_pii={send_default_pii} " + f"and include_prompts={include_prompts}" + ) + + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "function_call" + ] + assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] + + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "system_instructions_content", [ @@ -1239,11 +1946,13 @@ def test_langchain_openai_tools_agent_stream_no_prompts( def test_langchain_openai_tools_agent_stream( sentry_init, capture_events, + capture_items, system_instructions_content, request, get_model_response, server_side_event_chunks, streaming_chat_completions_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1253,8 +1962,8 @@ def test_langchain_openai_tools_agent_stream( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -1292,12 +2001,14 @@ def test_langchain_openai_tools_agent_stream( agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): list( agent_executor.stream( {"input": "How many letters in the word eudca"}, @@ -1305,98 
+2016,234 @@ def test_langchain_openai_tools_agent_stream( ) ) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") - tool_exec_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool") + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + chat_spans = list( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat" + ) + tool_exec_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.execute_tool" + ) - assert len(chat_spans) == 2 + assert len(chat_spans) == 2 - assert invoke_agent_span["origin"] == "auto.ai.langchain" - assert chat_spans[0]["origin"] == "auto.ai.langchain" - assert chat_spans[1]["origin"] == "auto.ai.langchain" - assert tool_exec_span["origin"] == "auto.ai.langchain" + assert invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[0]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert chat_spans[1]["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert tool_exec_span["attributes"]["sentry.origin"] == "auto.ai.langchain" - assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + assert ( + invoke_agent_span["attributes"]["gen_ai.function_id"] + == "my-snazzy-pipeline" + ) - # We can't guarantee anything about the "shape" of the langchain execution graph - assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + # We can't guarantee anything about the "shape" of the langchain execution graph + assert ( + len(list(x 
for x in spans if x["attributes"]["sentry.op"] == "gen_ai.chat")) + > 0 + ) - # Token usage is only available in newer versions of langchain (v0.2+) - # where usage_metadata is supported on AIMessageChunk - if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: - assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 - assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 - assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["attributes"]: + assert chat_spans[0]["attributes"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["attributes"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["attributes"]["gen_ai.usage.total_tokens"] == 192 - if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: - assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 - assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 - assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + if "gen_ai.usage.input_tokens" in chat_spans[1]["attributes"]: + assert chat_spans[1]["attributes"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["attributes"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["attributes"]["gen_ai.usage.total_tokens"] == 117 - assert "5" in chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] - assert 5 == int(tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_OUTPUT]) + assert "5" in chat_spans[0]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "word" in tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_INPUT] + assert 5 == int(tool_exec_span["attributes"][SPANDATA.GEN_AI_TOOL_OUTPUT]) - param_id = request.node.callspec.id - if "string" in param_id: - assert [ - { - "type": "text", - "content": "You are very powerful 
assistant, but don't know current events", - } - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + param_id = request.node.callspec.id + if "string" in param_id: + assert [ + { + "type": "text", + "content": "You are very powerful assistant, but don't know current events", + } + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + else: + assert [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] == json.loads( + chat_spans[0]["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) + + assert "5" in chat_spans[1]["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + # Verify tool calls are recorded when PII is enabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "attributes", {} + ), ( + "Tool calls should be recorded when send_default_pii=True and include_prompts=True" + ) + tool_calls_data = chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] + + assert isinstance(tool_calls_data, (list, str)) # Could be serialized + if isinstance(tool_calls_data, str): + assert "get_word_length" in tool_calls_data + elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0: + # Check if tool calls contain expected function name + tool_call_str = str(tool_calls_data) + assert "get_word_length" in tool_call_str + + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS + ] == ["function_call"] + assert chat_spans[1]["attributes"][ + SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS + ] == ["stop"] + + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS + ] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert 
"get_word_length" in tools_data else: - assert [ - { - "type": "text", - "content": "You are a helpful assistant.", - }, - { - "type": "text", - "content": "Be concise and clear.", - }, - ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) - - assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - - # Verify tool calls are recorded when PII is enabled - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get("data", {}), ( - "Tool calls should be recorded when send_default_pii=True and include_prompts=True" - ) - tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] - assert isinstance(tool_calls_data, (list, str)) # Could be serialized - if isinstance(tool_calls_data, str): - assert "get_word_length" in tool_calls_data - elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0: - # Check if tool calls contain expected function name - tool_call_str = str(tool_calls_data) - assert "get_word_length" in tool_call_str - - # Verify finish_reasons is always an array of strings - assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ - "function_call" - ] - assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == ["stop"] + events = capture_events() - # Verify that available tools are always recorded regardless of PII settings - for chat_span in chat_spans: - tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - assert tools_data is not None, ( - "Available tools should always be recorded regardless of PII settings" + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + list( + agent_executor.stream( + {"input": "How many letters in the word eudca"}, + {"run_name": "my-snazzy-pipeline"}, + ) + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + invoke_agent_span = next( + x for x in 
tx["spans"] if x["op"] == "gen_ai.invoke_agent" + ) + chat_spans = list(x for x in tx["spans"] if x["op"] == "gen_ai.chat") + tool_exec_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.execute_tool" ) - assert "get_word_length" in tools_data + assert len(chat_spans) == 2 + assert invoke_agent_span["origin"] == "auto.ai.langchain" + assert chat_spans[0]["origin"] == "auto.ai.langchain" + assert chat_spans[1]["origin"] == "auto.ai.langchain" + assert tool_exec_span["origin"] == "auto.ai.langchain" + + assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + + # We can't guarantee anything about the "shape" of the langchain execution graph + assert len(list(x for x in tx["spans"] if x["op"] == "gen_ai.chat")) > 0 + + # Token usage is only available in newer versions of langchain (v0.2+) + # where usage_metadata is supported on AIMessageChunk + if "gen_ai.usage.input_tokens" in chat_spans[0]["data"]: + assert chat_spans[0]["data"]["gen_ai.usage.input_tokens"] == 142 + assert chat_spans[0]["data"]["gen_ai.usage.output_tokens"] == 50 + assert chat_spans[0]["data"]["gen_ai.usage.total_tokens"] == 192 + + if "gen_ai.usage.input_tokens" in chat_spans[1]["data"]: + assert chat_spans[1]["data"]["gen_ai.usage.input_tokens"] == 89 + assert chat_spans[1]["data"]["gen_ai.usage.output_tokens"] == 28 + assert chat_spans[1]["data"]["gen_ai.usage.total_tokens"] == 117 + + assert "5" in chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "word" in tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_INPUT] + assert 5 == int(tool_exec_span["data"][SPANDATA.GEN_AI_TOOL_OUTPUT]) + + param_id = request.node.callspec.id + if "string" in param_id: + assert [ + { + "type": "text", + "content": "You are very powerful assistant, but don't know current events", + } + ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + else: + assert [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + 
"content": "Be concise and clear.", + }, + ] == json.loads(chat_spans[0]["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) + + assert "5" in chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + # Verify tool calls are recorded when PII is enabled + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in chat_spans[0].get( + "data", {} + ), ( + "Tool calls should be recorded when send_default_pii=True and include_prompts=True" + ) + tool_calls_data = chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + assert isinstance(tool_calls_data, (list, str)) # Could be serialized + if isinstance(tool_calls_data, str): + assert "get_word_length" in tool_calls_data + elif isinstance(tool_calls_data, list) and len(tool_calls_data) > 0: + # Check if tool calls contain expected function name + tool_call_str = str(tool_calls_data) + assert "get_word_length" in tool_call_str + + # Verify finish_reasons is always an array of strings + assert chat_spans[0]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "function_call" + ] + assert chat_spans[1]["data"][SPANDATA.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "stop" + ] + + # Verify that available tools are always recorded regardless of PII settings + for chat_span in chat_spans: + tools_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert tools_data is not None, ( + "Available tools should always be recorded regardless of PII settings" + ) + assert "get_word_length" in tools_data + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_langchain_openai_tools_agent_stream_with_config( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, streaming_chat_completions_model_responses, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1406,8 +2253,8 @@ def test_langchain_openai_tools_agent_stream_with_config( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = 
ChatPromptTemplate.from_messages( [ @@ -1447,27 +2294,63 @@ def test_langchain_openai_tools_agent_stream_with_config( agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with patch.object( - llm.client._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): list( agent_executor.stream( {"input": "How many letters in the word eudca"}, ) ) - tx = events[0] - assert tx["type"] == "transaction" - assert tx["contexts"]["trace"]["origin"] == "manual" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" - invoke_agent_span = next(x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent") - assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + x for x in spans if x["attributes"]["sentry.op"] == "gen_ai.invoke_agent" + ) + assert ( + invoke_agent_span["attributes"]["gen_ai.function_id"] + == "my-snazzy-pipeline" + ) + else: + events = capture_events() + + with patch.object( + llm.client._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _, start_transaction(): + list( + agent_executor.stream( + {"input": "How many letters in the word eudca"}, + ) + ) + + tx = events[0] + assert tx["type"] == "transaction" + assert tx["contexts"]["trace"]["origin"] == "manual" + + invoke_agent_span = next( + x for x in tx["spans"] if x["op"] == "gen_ai.invoke_agent" + ) + assert invoke_agent_span["data"]["gen_ai.function_id"] == "my-snazzy-pipeline" -def test_langchain_error(sentry_init, capture_events): 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): class MockOpenAI(ChatOpenAI): def _stream( self, @@ -1489,8 +2372,8 @@ def _llm_type(self) -> str: integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -1511,14 +2394,30 @@ def _llm_type(self) -> str: agent_executor = AgentExecutor(agent=agent, tools=[get_word_length], verbose=True) - with start_transaction(), pytest.raises(ValueError): - list(agent_executor.stream({"input": "How many letters in the word eudca"})) + if stream_gen_ai_spans: + items = capture_items("event") + + with start_transaction(), pytest.raises(ValueError): + list(agent_executor.stream({"input": "How many letters in the word eudca"})) + + (error,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() + + with start_transaction(), pytest.raises(ValueError): + list(agent_executor.stream({"input": "How many letters in the word eudca"})) - error = events[0] + error = events[0] assert error["level"] == "error" -def test_span_status_error(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_status_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): class MockOpenAI(ChatOpenAI): def _stream( self, @@ -1539,38 +2438,82 @@ def _llm_type(self) -> str: sentry_init( integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") + + with start_transaction(name="test"): + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are very powerful assistant, but 
don't know current events", + ), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), + ] + ) + llm = MockOpenAI( + model_name="gpt-3.5-turbo", + temperature=0, + openai_api_key="badkey", + ) + agent = create_openai_tools_agent(llm, [get_word_length], prompt) + + agent_executor = AgentExecutor( + agent=agent, tools=[get_word_length], verbose=True + ) + + with pytest.raises(ValueError): + list( + agent_executor.stream( + {"input": "How many letters in the word eudca"} + ) + ) + + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with start_transaction(name="test"): + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are very powerful assistant, but don't know current events", + ), + ("user", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), + ] + ) + llm = MockOpenAI( + model_name="gpt-3.5-turbo", + temperature=0, + openai_api_key="badkey", + ) + agent = create_openai_tools_agent(llm, [get_word_length], prompt) - with start_transaction(name="test"): - prompt = ChatPromptTemplate.from_messages( - [ - ( - "system", - "You are very powerful assistant, but don't know current events", - ), - ("user", "{input}"), - MessagesPlaceholder(variable_name="agent_scratchpad"), - ] - ) - llm = MockOpenAI( - model_name="gpt-3.5-turbo", - temperature=0, - openai_api_key="badkey", - ) - agent = create_openai_tools_agent(llm, [get_word_length], prompt) + agent_executor = AgentExecutor( + agent=agent, tools=[get_word_length], verbose=True + ) - agent_executor = AgentExecutor( - agent=agent, tools=[get_word_length], verbose=True - ) + with pytest.raises(ValueError): + list( + agent_executor.stream( + {"input": "How many letters in the word eudca"} 
+ ) + ) - with pytest.raises(ValueError): - list(agent_executor.stream({"input": "How many letters in the word eudca"})) + (error, transaction) = events + assert error["level"] == "error" + assert transaction["spans"][0]["status"] == "internal_error" + assert transaction["spans"][0]["tags"]["status"] == "internal_error" - (error, transaction) = events - assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" assert transaction["contexts"]["trace"]["status"] == "internal_error" @@ -1618,7 +2561,9 @@ def _llm_type(self): def _identifying_params(self): return {} - sentry_init(integrations=[LangchainIntegration()]) + sentry_init( + integrations=[LangchainIntegration()], _experiments={"gen_ai_as_v2_spans": True} + ) # Create a manual SentryLangchainCallback manual_callback = SentryLangchainCallback( @@ -1783,7 +2728,13 @@ def test_langchain_callback_list_existing_callback(sentry_init): assert handler is sentry_callback -def test_langchain_message_role_mapping(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_message_role_mapping( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that message roles are properly normalized in langchain integration.""" class MockOpenAI(ChatOpenAI): @@ -1816,8 +2767,8 @@ def _llm_type(self) -> str: integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() prompt = ChatPromptTemplate.from_messages( [ @@ -1838,54 +2789,112 @@ def _llm_type(self) -> str: # Test input that should trigger message role normalization test_input = "Hello, how are you?" 
- with start_transaction(): - list(agent_executor.stream({"input": test_input})) + message_data_found = False + if stream_gen_ai_spans: + items = capture_items("span") - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(): + list(agent_executor.stream({"input": test_input})) - # Find spans with gen_ai operation that should have message data - gen_ai_spans = [ - span for span in tx.get("spans", []) if span.get("op", "").startswith("gen_ai") - ] + spans = [item.payload for item in items if item.type == "span"] + # Find spans with gen_ai operation that should have message data + gen_ai_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op", "").startswith("gen_ai") + ] - # Check if any span has message data with normalized roles - message_data_found = False - for span in gen_ai_spans: - span_data = span.get("data", {}) - if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_data: - message_data_found = True - messages_data = span_data[SPANDATA.GEN_AI_REQUEST_MESSAGES] - - # Parse the message data (might be JSON string) - if isinstance(messages_data, str): - try: - messages = json.loads(messages_data) - except json.JSONDecodeError: - # If not valid JSON, skip this assertion - continue - else: - messages = messages_data + # Check if any span has message data with normalized roles + for span in gen_ai_spans: + span_data = span.get("attributes", {}) + if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_data: + message_data_found = True + messages_data = span_data[SPANDATA.GEN_AI_REQUEST_MESSAGES] + + # Parse the message data (might be JSON string) + if isinstance(messages_data, str): + try: + messages = json.loads(messages_data) + except json.JSONDecodeError: + # If not valid JSON, skip this assertion + continue + else: + messages = messages_data + + # Verify that the input message is present and contains the test input + assert isinstance(messages, list) + assert len(messages) > 0 + + # The test input should be in 
one of the messages + input_found = False + for msg in messages: + if isinstance(msg, dict) and test_input in str( + msg.get("content", "") + ): + input_found = True + break + elif isinstance(msg, str) and test_input in msg: + input_found = True + break + + assert input_found, ( + f"Test input '{test_input}' not found in messages: {messages}" + ) + break + else: + events = capture_events() - # Verify that the input message is present and contains the test input - assert isinstance(messages, list) - assert len(messages) > 0 + with start_transaction(): + list(agent_executor.stream({"input": test_input})) - # The test input should be in one of the messages - input_found = False - for msg in messages: - if isinstance(msg, dict) and test_input in str(msg.get("content", "")): - input_found = True - break - elif isinstance(msg, str) and test_input in msg: - input_found = True - break + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" - assert input_found, ( - f"Test input '{test_input}' not found in messages: {messages}" - ) - break + # Find spans with gen_ai operation that should have message data + gen_ai_spans = [ + span + for span in tx.get("spans", []) + if span.get("op", "").startswith("gen_ai") + ] + + # Check if any span has message data with normalized roles + for span in gen_ai_spans: + span_data = span.get("data", {}) + if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_data: + message_data_found = True + messages_data = span_data[SPANDATA.GEN_AI_REQUEST_MESSAGES] + + # Parse the message data (might be JSON string) + if isinstance(messages_data, str): + try: + messages = json.loads(messages_data) + except json.JSONDecodeError: + # If not valid JSON, skip this assertion + continue + else: + messages = messages_data + + # Verify that the input message is present and contains the test input + assert isinstance(messages, list) + assert len(messages) > 0 + + # The test input should be in one of the messages + input_found = False + for msg in 
messages: + if isinstance(msg, dict) and test_input in str( + msg.get("content", "") + ): + input_found = True + break + elif isinstance(msg, str) and test_input in msg: + input_found = True + break + + assert input_found, ( + f"Test input '{test_input}' not found in messages: {messages}" + ) + break # The message role mapping functionality is primarily tested through the normalization # that happens in the integration code. The fact that we can capture and process @@ -1934,7 +2943,13 @@ def test_langchain_message_role_normalization_units(): assert normalized[5] == "string message" # String message unchanged -def test_langchain_message_truncation(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_message_truncation( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that large messages are truncated properly in Langchain integration.""" from langchain_core.outputs import LLMResult, Generation @@ -1942,8 +2957,8 @@ def test_langchain_message_truncation(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() callback = SentryLangchainCallback(max_span_map_size=100, include_prompts=True) @@ -1961,48 +2976,101 @@ def test_langchain_message_truncation(sentry_init, capture_events): "small message 5", ] - with start_transaction(): - callback.on_llm_start( - serialized=serialized, - prompts=prompts, - run_id=run_id, - name="my_pipeline", - invocation_params={ - "temperature": 0.7, - "max_tokens": 100, - "model": "gpt-3.5-turbo", - }, - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - response = LLMResult( - generations=[[Generation(text="The response")]], - llm_output={ - "token_usage": { - "total_tokens": 25, - "prompt_tokens": 10, - "completion_tokens": 15, - } - }, - ) - 
callback.on_llm_end(response=response, run_id=run_id) + with start_transaction(): + callback.on_llm_start( + serialized=serialized, + prompts=prompts, + run_id=run_id, + name="my_pipeline", + invocation_params={ + "temperature": 0.7, + "max_tokens": 100, + "model": "gpt-3.5-turbo", + }, + ) + + response = LLMResult( + generations=[[Generation(text="The response")]], + llm_output={ + "token_usage": { + "total_tokens": 25, + "prompt_tokens": 10, + "completion_tokens": 15, + } + }, + ) + callback.on_llm_end(response=response, run_id=run_id) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["type"] == "transaction" - llm_spans = [ - span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" - ] - assert len(llm_spans) > 0 + spans = [item.payload for item in items if item.type == "span"] + llm_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" + ] + + assert len(llm_spans) > 0 + + llm_span = llm_spans[0] + + assert llm_span["attributes"]["gen_ai.operation.name"] == "text_completion" + assert llm_span["attributes"][SPANDATA.GEN_AI_FUNCTION_ID] == "my_pipeline" + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in llm_span["attributes"] + messages_data = llm_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + else: + events = capture_events() + + with start_transaction(): + callback.on_llm_start( + serialized=serialized, + prompts=prompts, + run_id=run_id, + name="my_pipeline", + invocation_params={ + "temperature": 0.7, + "max_tokens": 100, + "model": "gpt-3.5-turbo", + }, + ) + + response = LLMResult( + generations=[[Generation(text="The response")]], + llm_output={ + "token_usage": { + "total_tokens": 25, + "prompt_tokens": 10, + "completion_tokens": 15, + } + }, + ) + callback.on_llm_end(response=response, run_id=run_id) - llm_span = llm_spans[0] - assert 
llm_span["data"]["gen_ai.operation.name"] == "text_completion" - assert llm_span["data"][SPANDATA.GEN_AI_FUNCTION_ID] == "my_pipeline" + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" + + llm_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.text_completion" + ] + + assert len(llm_spans) > 0 + + llm_span = llm_spans[0] + + assert llm_span["data"]["gen_ai.operation.name"] == "text_completion" + assert llm_span["data"][SPANDATA.GEN_AI_FUNCTION_ID] == "my_pipeline" + + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in llm_span["data"] + messages_data = llm_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in llm_span["data"] - messages_data = llm_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] assert isinstance(messages_data, str) parsed_messages = json.loads(messages_data) @@ -2012,6 +3080,7 @@ def test_langchain_message_truncation(sentry_init, capture_events): assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -2022,7 +3091,12 @@ def test_langchain_message_truncation(sentry_init, capture_events): ], ) def test_langchain_embeddings_sync( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """Test that sync embedding methods (embed_documents, embed_query) are properly traced.""" try: @@ -2034,61 +3108,127 @@ def test_langchain_embeddings_sync( integrations=[LangchainIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + 
"embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ) as mock_embed_documents: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Mock the actual API call - with mock.patch.object( - OpenAIEmbeddings, - "embed_documents", - wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], - ) as mock_embed_documents: - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() + + with start_transaction(name="test_embeddings"): + # Test embed_documents + result = embeddings.embed_documents(["Hello world", "Test document"]) + + assert len(result) == 2 + mock_embed_documents.assert_called_once() + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings span + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["name"] == "embeddings text-embedding-ada-002" + assert embeddings_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" ) - # Force setup to re-run to ensure our mock is wrapped - LangchainIntegration.setup_once() + # Check if input is captured based on PII settings + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + + # Could be serialized as string + if isinstance(input_data, str): + assert "Hello world" in input_data + assert "Test document" in input_data + else: + assert "Hello world" in input_data + assert "Test document" in input_data + else: 
+ assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "attributes", {} + ) + else: + events = capture_events() + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ) as mock_embed_documents: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - with start_transaction(name="test_embeddings"): - # Test embed_documents - result = embeddings.embed_documents(["Hello world", "Test document"]) + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() - assert len(result) == 2 - mock_embed_documents.assert_called_once() + with start_transaction(name="test_embeddings"): + # Test embed_documents + result = embeddings.embed_documents(["Hello world", "Test document"]) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + assert len(result) == 2 + mock_embed_documents.assert_called_once() - # Find embeddings span - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 1 + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" - embeddings_span = embeddings_spans[0] - assert embeddings_span["description"] == "embeddings text-embedding-ada-002" - assert embeddings_span["origin"] == "auto.ai.langchain" - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + # Find embeddings span + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 - # Check if input is captured based on PII settings - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - 
input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - # Could be serialized as string - if isinstance(input_data, str): - assert "Hello world" in input_data - assert "Test document" in input_data + embeddings_span = embeddings_spans[0] + assert embeddings_span["description"] == "embeddings text-embedding-ada-002" + assert embeddings_span["origin"] == "auto.ai.langchain" + assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + ) + + # Check if input is captured based on PII settings + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] + input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + # Could be serialized as string + if isinstance(input_data, str): + assert "Hello world" in input_data + assert "Test document" in input_data + else: + assert "Hello world" in input_data + assert "Test document" in input_data else: - assert "Hello world" in input_data - assert "Test document" in input_data - else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get("data", {}) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "data", {} + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -2097,7 +3237,12 @@ def test_langchain_embeddings_sync( ], ) def test_langchain_embeddings_embed_query( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """Test that embed_query method is properly traced.""" try: @@ -2109,56 +3254,117 @@ def test_langchain_embeddings_embed_query( integrations=[LangchainIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + 
stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "embed_query", + wraps=lambda self, text: [0.1, 0.2, 0.3], + ) as mock_embed_query: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Mock the actual API call - with mock.patch.object( - OpenAIEmbeddings, - "embed_query", - wraps=lambda self, text: [0.1, 0.2, 0.3], - ) as mock_embed_query: - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() + + with start_transaction(name="test_embeddings_query"): + result = embeddings.embed_query("What is the capital of France?") + + assert len(result) == 3 + mock_embed_query.assert_called_once() + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings span + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" ) - # Force setup to re-run to ensure our mock is wrapped - LangchainIntegration.setup_once() + # Check if input is captured based on PII settings + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - with start_transaction(name="test_embeddings_query"): - result = embeddings.embed_query("What is the capital of France?") + # Could be serialized as string + if isinstance(input_data, str): + assert "What is the capital of France?" 
in input_data + else: + assert "What is the capital of France?" in input_data + else: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "attributes", {} + ) + else: + events = capture_events() + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "embed_query", + wraps=lambda self, text: [0.1, 0.2, 0.3], + ) as mock_embed_query: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - assert len(result) == 3 - mock_embed_query.assert_called_once() + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(name="test_embeddings_query"): + result = embeddings.embed_query("What is the capital of France?") - # Find embeddings span - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 1 + assert len(result) == 3 + mock_embed_query.assert_called_once() - embeddings_span = embeddings_spans[0] - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" - # Check if input is captured based on PII settings - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - # Could be serialized as string - if isinstance(input_data, str): - assert "What is the capital of France?" 
in input_data + # Find embeddings span + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + ) + + # Check if input is captured based on PII settings + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] + input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + # Could be serialized as string + if isinstance(input_data, str): + assert "What is the capital of France?" in input_data + else: + assert "What is the capital of France?" in input_data else: - assert "What is the capital of France?" in input_data - else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get("data", {}) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "data", {} + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -2168,7 +3374,12 @@ def test_langchain_embeddings_embed_query( ) @pytest.mark.asyncio async def test_langchain_embeddings_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """Test that async embedding methods (aembed_documents, aembed_query) are properly traced.""" try: @@ -2180,65 +3391,145 @@ async def test_langchain_embeddings_async( integrations=[LangchainIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() async def mock_aembed_documents(self, texts): return [[0.1, 0.2, 0.3] for _ in texts] - # Mock the actual API call - with 
mock.patch.object( - OpenAIEmbeddings, - "aembed_documents", - wraps=mock_aembed_documents, - ) as mock_aembed: - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "aembed_documents", + wraps=mock_aembed_documents, + ) as mock_aembed: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) + + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() + + with start_transaction(name="test_async_embeddings"): + result = await embeddings.aembed_documents( + ["Async hello", "Async test document"] + ) + + assert len(result) == 2 + mock_aembed.assert_called_once() + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings span + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["name"] == "embeddings text-embedding-ada-002" + assert embeddings_span["attributes"]["sentry.origin"] == "auto.ai.langchain" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" ) - # Force setup to re-run to ensure our mock is wrapped - LangchainIntegration.setup_once() + # Check if input is captured based on PII settings + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - with start_transaction(name="test_async_embeddings"): - result = await embeddings.aembed_documents( - ["Async hello", "Async test document"] + # Could be serialized as string + if isinstance(input_data, str): + assert ( 
+ "Async hello" in input_data or "Async test document" in input_data + ) + else: + assert ( + "Async hello" in input_data or "Async test document" in input_data + ) + else: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "attributes", {} ) - assert len(result) == 2 - mock_aembed.assert_called_once() + else: + events = capture_events() + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "aembed_documents", + wraps=mock_aembed_documents, + ) as mock_aembed: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() - # Find embeddings span - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 1 + with start_transaction(name="test_async_embeddings"): + result = await embeddings.aembed_documents( + ["Async hello", "Async test document"] + ) - embeddings_span = embeddings_spans[0] - assert embeddings_span["description"] == "embeddings text-embedding-ada-002" - assert embeddings_span["origin"] == "auto.ai.langchain" - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert len(result) == 2 + mock_aembed.assert_called_once() - # Check if input is captured based on PII settings - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - # Could be serialized as string - if isinstance(input_data, str): - assert "Async hello" in input_data or "Async test document" in input_data + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == 
"transaction" + + # Find embeddings span + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["description"] == "embeddings text-embedding-ada-002" + assert embeddings_span["origin"] == "auto.ai.langchain" + assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + ) + + # Check if input is captured based on PII settings + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] + input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + # Could be serialized as string + if isinstance(input_data, str): + assert ( + "Async hello" in input_data or "Async test document" in input_data + ) + else: + assert ( + "Async hello" in input_data or "Async test document" in input_data + ) else: - assert "Async hello" in input_data or "Async test document" in input_data - else: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get("data", {}) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in embeddings_span.get( + "data", {} + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_langchain_embeddings_aembed_query(sentry_init, capture_events): +async def test_langchain_embeddings_aembed_query( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that aembed_query method is properly traced.""" try: from langchain_openai import OpenAIEmbeddings @@ -2249,49 +3540,98 @@ async def test_langchain_embeddings_aembed_query(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() async def mock_aembed_query(self, text): 
return [0.1, 0.2, 0.3] - # Mock the actual API call - with mock.patch.object( - OpenAIEmbeddings, - "aembed_query", - wraps=mock_aembed_query, - ) as mock_aembed: - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "aembed_query", + wraps=mock_aembed_query, + ) as mock_aembed: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) + + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() + + with start_transaction(name="test_async_embeddings_query"): + result = await embeddings.aembed_query("Async query test") + + assert len(result) == 3 + mock_aembed.assert_called_once() + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings span + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["attributes"]["gen_ai.request.model"] + == "text-embedding-ada-002" ) - # Force setup to re-run to ensure our mock is wrapped - LangchainIntegration.setup_once() + # Check if input is captured + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["attributes"] + input_data = embeddings_span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + else: + events = capture_events() + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "aembed_query", + wraps=mock_aembed_query, + ) as mock_aembed: + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - with start_transaction(name="test_async_embeddings_query"): - result = await embeddings.aembed_query("Async query test") + # Force setup to 
re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() - assert len(result) == 3 - mock_aembed.assert_called_once() + with start_transaction(name="test_async_embeddings_query"): + result = await embeddings.aembed_query("Async query test") - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + assert len(result) == 3 + mock_aembed.assert_called_once() - # Find embeddings span - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 1 + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" + + # Find embeddings span + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 - embeddings_span = embeddings_spans[0] - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - assert embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + embeddings_span = embeddings_spans[0] + assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert ( + embeddings_span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + ) + + # Check if input is captured + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] + input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - # Check if input is captured - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in embeddings_span["data"] - input_data = embeddings_span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] # Could be serialized as string if isinstance(input_data, str): assert "Async query test" in input_data @@ -2299,94 +3639,183 @@ async def mock_aembed_query(self, text): assert "Async query test" in input_data -def test_langchain_embeddings_no_model_name(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def 
test_langchain_embeddings_no_model_name( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test embeddings when model name is not available.""" try: from langchain_openai import OpenAIEmbeddings except ImportError: pytest.skip("langchain_openai not installed") - sentry_init( - integrations=[LangchainIntegration(include_prompts=False)], - traces_sample_rate=1.0, - ) - events = capture_events() + sentry_init( + integrations=[LangchainIntegration(include_prompts=False)], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API call and remove model attribute + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ): + embeddings = OpenAIEmbeddings(openai_api_key="test-key") + # Remove model attribute to test fallback + delattr(embeddings, "model") + if hasattr(embeddings, "model_name"): + delattr(embeddings, "model_name") + + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() + + with start_transaction(name="test_embeddings_no_model"): + embeddings.embed_documents(["Test"]) + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings span + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 + + embeddings_span = embeddings_spans[0] + assert embeddings_span["name"] == "embeddings" + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + # Model name should not be set if not available + assert ( + "gen_ai.request.model" not in embeddings_span["attributes"] + or embeddings_span["attributes"]["gen_ai.request.model"] is None + ) + else: + events = capture_events() - # Mock the actual API call and remove model attribute - with mock.patch.object( - OpenAIEmbeddings, - 
"embed_documents", - wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], - ): - embeddings = OpenAIEmbeddings(openai_api_key="test-key") - # Remove model attribute to test fallback - delattr(embeddings, "model") - if hasattr(embeddings, "model_name"): - delattr(embeddings, "model_name") + # Mock the actual API call and remove model attribute + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ): + embeddings = OpenAIEmbeddings(openai_api_key="test-key") + # Remove model attribute to test fallback + delattr(embeddings, "model") + if hasattr(embeddings, "model_name"): + delattr(embeddings, "model_name") - # Force setup to re-run to ensure our mock is wrapped - LangchainIntegration.setup_once() + # Force setup to re-run to ensure our mock is wrapped + LangchainIntegration.setup_once() - with start_transaction(name="test_embeddings_no_model"): - embeddings.embed_documents(["Test"]) + with start_transaction(name="test_embeddings_no_model"): + embeddings.embed_documents(["Test"]) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" - # Find embeddings span - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 1 + # Find embeddings span + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 1 - embeddings_span = embeddings_spans[0] - assert embeddings_span["description"] == "embeddings" - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" - # Model name should not be set if not available - assert ( - "gen_ai.request.model" not in embeddings_span["data"] - or embeddings_span["data"]["gen_ai.request.model"] is None - ) + embeddings_span = 
embeddings_spans[0] + assert embeddings_span["description"] == "embeddings" + assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + # Model name should not be set if not available + assert ( + "gen_ai.request.model" not in embeddings_span["data"] + or embeddings_span["data"]["gen_ai.request.model"] is None + ) -def test_langchain_embeddings_integration_disabled(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_embeddings_integration_disabled( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that embeddings are not traced when integration is disabled.""" try: from langchain_openai import OpenAIEmbeddings except ImportError: pytest.skip("langchain_openai not installed") + sentry_init( + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + # Initialize without LangchainIntegration - sentry_init(traces_sample_rate=1.0) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") - with mock.patch.object( - OpenAIEmbeddings, - "embed_documents", - return_value=[[0.1, 0.2, 0.3]], - ): - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" - ) + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + return_value=[[0.1, 0.2, 0.3]], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - with start_transaction(name="test_embeddings_disabled"): - embeddings.embed_documents(["Test"]) + with start_transaction(name="test_embeddings_disabled"): + embeddings.embed_documents(["Test"]) - # Check that no embeddings spans were created - if events: - tx = events[0] + # Check that no embeddings spans were created + spans = [item.payload for item in items if item.type == "span"] embeddings_spans = [ span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.embeddings" + for span in spans + if 
span["attributes"].get("sentry.op") == "gen_ai.embeddings" ] # Should be empty since integration is disabled assert len(embeddings_spans) == 0 + else: + events = capture_events() + + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + return_value=[[0.1, 0.2, 0.3]], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) + + with start_transaction(name="test_embeddings_disabled"): + embeddings.embed_documents(["Test"]) + + # Check that no embeddings spans were created + if events: + tx = events[0] + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + # Should be empty since integration is disabled + assert len(embeddings_spans) == 0 -def test_langchain_embeddings_multiple_providers(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_embeddings_multiple_providers( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that embeddings work with different providers.""" try: from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings @@ -2397,52 +3826,102 @@ def test_langchain_embeddings_multiple_providers(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock both providers + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ), mock.patch.object( + AzureOpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.4, 0.5, 0.6] for _ in texts], + ): + openai_embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) + azure_embeddings = AzureOpenAIEmbeddings( + model="text-embedding-ada-002", + 
azure_endpoint="https://test.openai.azure.com/", + openai_api_key="test-key", + ) - # Mock both providers - with mock.patch.object( - OpenAIEmbeddings, - "embed_documents", - wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], - ), mock.patch.object( - AzureOpenAIEmbeddings, - "embed_documents", - wraps=lambda self, texts: [[0.4, 0.5, 0.6] for _ in texts], - ): - openai_embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" - ) - azure_embeddings = AzureOpenAIEmbeddings( - model="text-embedding-ada-002", - azure_endpoint="https://test.openai.azure.com/", - openai_api_key="test-key", - ) + # Force setup to re-run + LangchainIntegration.setup_once() - # Force setup to re-run - LangchainIntegration.setup_once() + with start_transaction(name="test_multiple_providers"): + openai_embeddings.embed_documents(["OpenAI test"]) + azure_embeddings.embed_documents(["Azure test"]) + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings spans + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + # Should have 2 spans, one for each provider + assert len(embeddings_spans) == 2 + + # Verify both spans have proper data + for span in embeddings_spans: + assert span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + span["attributes"]["gen_ai.request.model"] == "text-embedding-ada-002" + ) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["attributes"] + else: + events = capture_events() + + # Mock both providers + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ), mock.patch.object( + AzureOpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.4, 0.5, 0.6] for _ in texts], + ): + openai_embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) + azure_embeddings = AzureOpenAIEmbeddings( + 
model="text-embedding-ada-002", + azure_endpoint="https://test.openai.azure.com/", + openai_api_key="test-key", + ) - with start_transaction(name="test_multiple_providers"): - openai_embeddings.embed_documents(["OpenAI test"]) - azure_embeddings.embed_documents(["Azure test"]) + # Force setup to re-run + LangchainIntegration.setup_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(name="test_multiple_providers"): + openai_embeddings.embed_documents(["OpenAI test"]) + azure_embeddings.embed_documents(["Azure test"]) - # Find embeddings spans - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - # Should have 2 spans, one for each provider - assert len(embeddings_spans) == 2 + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" + + # Find embeddings spans + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + # Should have 2 spans, one for each provider + assert len(embeddings_spans) == 2 - # Verify both spans have proper data - for span in embeddings_spans: - assert span["data"]["gen_ai.operation.name"] == "embeddings" - assert span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] + # Verify both spans have proper data + for span in embeddings_spans: + assert span["data"]["gen_ai.operation.name"] == "embeddings" + assert span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] def test_langchain_embeddings_error_handling(sentry_init, capture_events): @@ -2472,9 +3951,8 @@ def test_langchain_embeddings_error_handling(sentry_init, capture_events): # Force setup to re-run LangchainIntegration.setup_once() - with start_transaction(name="test_embeddings_error"): - with 
pytest.raises(ValueError): - embeddings.embed_documents(["Test"]) + with start_transaction(name="test_embeddings_error"), pytest.raises(ValueError): + embeddings.embed_documents(["Test"]) # The error should be captured assert len(events) >= 1 @@ -2484,7 +3962,13 @@ def test_langchain_embeddings_error_handling(sentry_init, capture_events): # but the span should still be created -def test_langchain_embeddings_multiple_calls(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_embeddings_multiple_calls( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that multiple embeddings calls within a transaction are all traced.""" try: from langchain_openai import OpenAIEmbeddings @@ -2495,60 +3979,120 @@ def test_langchain_embeddings_multiple_calls(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API calls + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ), mock.patch.object( + OpenAIEmbeddings, + "embed_query", + wraps=lambda self, text: [0.4, 0.5, 0.6], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Mock the actual API calls - with mock.patch.object( - OpenAIEmbeddings, - "embed_documents", - wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], - ), mock.patch.object( - OpenAIEmbeddings, - "embed_query", - wraps=lambda self, text: [0.4, 0.5, 0.6], - ): - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" - ) + # Force setup to re-run + LangchainIntegration.setup_once() - # Force setup to re-run - LangchainIntegration.setup_once() + with 
start_transaction(name="test_multiple_embeddings"): + # Call embed_documents + embeddings.embed_documents(["First batch", "Second batch"]) + # Call embed_query + embeddings.embed_query("Single query") + # Call embed_documents again + embeddings.embed_documents(["Third batch"]) - with start_transaction(name="test_multiple_embeddings"): - # Call embed_documents - embeddings.embed_documents(["First batch", "Second batch"]) - # Call embed_query - embeddings.embed_query("Single query") - # Call embed_documents again - embeddings.embed_documents(["Third batch"]) + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings spans - should have 3 (2 embed_documents + 1 embed_query) + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 3 - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + # Verify all spans have proper data + for span in embeddings_spans: + assert span["attributes"]["gen_ai.operation.name"] == "embeddings" + assert ( + span["attributes"]["gen_ai.request.model"] == "text-embedding-ada-002" + ) + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["attributes"] - # Find embeddings spans - should have 3 (2 embed_documents + 1 embed_query) - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 3 + # Verify the input data is different for each span + input_data_list = [ + span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + for span in embeddings_spans + ] + else: + events = capture_events() + + # Mock the actual API calls + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ), mock.patch.object( + OpenAIEmbeddings, + "embed_query", + wraps=lambda self, text: [0.4, 0.5, 0.6], + ): + embeddings = OpenAIEmbeddings( + 
model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Verify all spans have proper data - for span in embeddings_spans: - assert span["data"]["gen_ai.operation.name"] == "embeddings" - assert span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] + # Force setup to re-run + LangchainIntegration.setup_once() - # Verify the input data is different for each span - input_data_list = [ - span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] for span in embeddings_spans - ] + with start_transaction(name="test_multiple_embeddings"): + # Call embed_documents + embeddings.embed_documents(["First batch", "Second batch"]) + # Call embed_query + embeddings.embed_query("Single query") + # Call embed_documents again + embeddings.embed_documents(["Third batch"]) + + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" + + # Find embeddings spans - should have 3 (2 embed_documents + 1 embed_query) + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 3 + + # Verify all spans have proper data + for span in embeddings_spans: + assert span["data"]["gen_ai.operation.name"] == "embeddings" + assert span["data"]["gen_ai.request.model"] == "text-embedding-ada-002" + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] + + # Verify the input data is different for each span + input_data_list = [ + span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] for span in embeddings_spans + ] # They should all be different (different inputs) assert len(set(str(data) for data in input_data_list)) == 3 -def test_langchain_embeddings_span_hierarchy(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_embeddings_span_hierarchy( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that embeddings spans are 
properly nested within parent spans.""" try: from langchain_openai import OpenAIEmbeddings @@ -2559,49 +4103,103 @@ def test_langchain_embeddings_span_hierarchy(sentry_init, capture_events): integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + # Mock the actual API call + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Mock the actual API call - with mock.patch.object( - OpenAIEmbeddings, - "embed_documents", - wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], - ): - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" - ) + # Force setup to re-run + LangchainIntegration.setup_once() - # Force setup to re-run - LangchainIntegration.setup_once() + with start_transaction(name="test_span_hierarchy"), sentry_sdk.start_span( + op="custom", name="custom operation" + ): + embeddings.embed_documents(["Test within custom span"]) + + spans = [item.payload for item in items if item.type == "span"] + # Find all spans + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + tx = next(item.payload for item in items if item.type == "transaction") + custom_spans = [ + span for span in tx.get("spans", []) if span.get("op") == "custom" + ] + + assert len(embeddings_spans) == 1 + assert len(custom_spans) == 1 + + # Both spans should exist + embeddings_span = embeddings_spans[0] + custom_span = custom_spans[0] + + assert embeddings_span["attributes"]["gen_ai.operation.name"] == "embeddings" + else: + events = capture_events() + + # Mock the actual API call + with mock.patch.object( + 
OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - with start_transaction(name="test_span_hierarchy"): - with sentry_sdk.start_span(op="custom", name="custom operation"): + # Force setup to re-run + LangchainIntegration.setup_once() + + with start_transaction(name="test_span_hierarchy"), sentry_sdk.start_span( + op="custom", name="custom operation" + ): embeddings.embed_documents(["Test within custom span"]) - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" - # Find all spans - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - custom_spans = [span for span in tx.get("spans", []) if span.get("op") == "custom"] + # Find all spans + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + custom_spans = [ + span for span in tx.get("spans", []) if span.get("op") == "custom" + ] - assert len(embeddings_spans) == 1 - assert len(custom_spans) == 1 + assert len(embeddings_spans) == 1 + assert len(custom_spans) == 1 - # Both spans should exist - embeddings_span = embeddings_spans[0] - custom_span = custom_spans[0] + # Both spans should exist + embeddings_span = embeddings_spans[0] + custom_span = custom_spans[0] - assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" + assert embeddings_span["data"]["gen_ai.operation.name"] == "embeddings" assert custom_span["description"] == "custom operation" -def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langchain_embeddings_with_list_and_string_inputs( + sentry_init, + capture_events, + 
capture_items, + stream_gen_ai_spans, +): """Test that embeddings correctly handle both list and string inputs.""" try: from langchain_openai import OpenAIEmbeddings @@ -2612,55 +4210,109 @@ def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_e integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("span") + + # Mock the actual API calls + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ), mock.patch.object( + OpenAIEmbeddings, + "embed_query", + wraps=lambda self, text: [0.4, 0.5, 0.6], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - # Mock the actual API calls - with mock.patch.object( - OpenAIEmbeddings, - "embed_documents", - wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], - ), mock.patch.object( - OpenAIEmbeddings, - "embed_query", - wraps=lambda self, text: [0.4, 0.5, 0.6], - ): - embeddings = OpenAIEmbeddings( - model="text-embedding-ada-002", openai_api_key="test-key" - ) + # Force setup to re-run + LangchainIntegration.setup_once() - # Force setup to re-run - LangchainIntegration.setup_once() + with start_transaction(name="test_input_types"): + # embed_documents takes a list + embeddings.embed_documents( + ["List item 1", "List item 2", "List item 3"] + ) + # embed_query takes a string + embeddings.embed_query("Single string query") + + spans = [item.payload for item in items if item.type == "span"] + # Find embeddings spans + embeddings_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 2 + + # Both should have input data captured as lists + for span in embeddings_spans: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in 
span["attributes"] + input_data = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + # Input should be normalized to list format + if isinstance(input_data, str): + # If serialized, should contain the input text + assert ( + "List item" in input_data or "Single string query" in input_data + ), f"Expected input text in serialized data: {input_data}" + else: + events = capture_events() + + # Mock the actual API calls + with mock.patch.object( + OpenAIEmbeddings, + "embed_documents", + wraps=lambda self, texts: [[0.1, 0.2, 0.3] for _ in texts], + ), mock.patch.object( + OpenAIEmbeddings, + "embed_query", + wraps=lambda self, text: [0.4, 0.5, 0.6], + ): + embeddings = OpenAIEmbeddings( + model="text-embedding-ada-002", openai_api_key="test-key" + ) - with start_transaction(name="test_input_types"): - # embed_documents takes a list - embeddings.embed_documents(["List item 1", "List item 2", "List item 3"]) - # embed_query takes a string - embeddings.embed_query("Single string query") + # Force setup to re-run + LangchainIntegration.setup_once() - # Check captured events - assert len(events) >= 1 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(name="test_input_types"): + # embed_documents takes a list + embeddings.embed_documents( + ["List item 1", "List item 2", "List item 3"] + ) + # embed_query takes a string + embeddings.embed_query("Single string query") - # Find embeddings spans - embeddings_spans = [ - span for span in tx.get("spans", []) if span.get("op") == "gen_ai.embeddings" - ] - assert len(embeddings_spans) == 2 + # Check captured events + assert len(events) >= 1 + tx = events[0] + assert tx["type"] == "transaction" - # Both should have input data captured as lists - for span in embeddings_spans: - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] - input_data = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - # Input should be normalized to list format - if isinstance(input_data, str): - # If serialized, should 
contain the input text - assert "List item" in input_data or "Single string query" in input_data, ( - f"Expected input text in serialized data: {input_data}" - ) + # Find embeddings spans + embeddings_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.embeddings" + ] + assert len(embeddings_spans) == 2 + # Both should have input data captured as lists + for span in embeddings_spans: + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT in span["data"] + input_data = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + # Input should be normalized to list format + if isinstance(input_data, str): + # If serialized, should contain the input text + assert ( + "List item" in input_data or "Single string query" in input_data + ), f"Expected input text in serialized data: {input_data}" + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "response_metadata_model,expected_model", [ @@ -2671,15 +4323,17 @@ def test_langchain_embeddings_with_list_and_string_inputs(sentry_init, capture_e def test_langchain_response_model_extraction( sentry_init, capture_events, + capture_items, response_metadata_model, expected_model, + stream_gen_ai_spans, ): sentry_init( integrations=[LangchainIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() callback = SentryLangchainCallback(max_span_map_size=100, include_prompts=True) @@ -2687,42 +4341,83 @@ def test_langchain_response_model_extraction( serialized = {"_type": "openai-chat", "model_name": "gpt-3.5-turbo"} prompts = ["Test prompt"] - with start_transaction(): - callback.on_llm_start( - serialized=serialized, - prompts=prompts, - run_id=run_id, - invocation_params={"model": "gpt-3.5-turbo"}, - ) + if stream_gen_ai_spans: + items = capture_items("span") - response_metadata = {"model_name": response_metadata_model} - message = AIMessageChunk( - content="Test response", 
response_metadata=response_metadata - ) + with start_transaction(): + callback.on_llm_start( + serialized=serialized, + prompts=prompts, + run_id=run_id, + invocation_params={"model": "gpt-3.5-turbo"}, + ) - generation = Mock(text="Test response", message=message) - response = Mock(generations=[[generation]]) - callback.on_llm_end(response=response, run_id=run_id) + response_metadata = {"model_name": response_metadata_model} + message = AIMessageChunk( + content="Test response", response_metadata=response_metadata + ) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + generation = Mock(text="Test response", message=message) + response = Mock(generations=[[generation]]) + callback.on_llm_end(response=response, run_id=run_id) - llm_spans = [ - span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" - ] - assert len(llm_spans) > 0 + spans = [item.payload for item in items if item.type == "span"] + llm_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" + ] + assert len(llm_spans) > 0 - llm_span = llm_spans[0] - assert llm_span["data"]["gen_ai.operation.name"] == "text_completion" + llm_span = llm_spans[0] + assert llm_span["attributes"]["gen_ai.operation.name"] == "text_completion" - if expected_model is not None: - assert SPANDATA.GEN_AI_RESPONSE_MODEL in llm_span["data"] - assert llm_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == expected_model + if expected_model is not None: + assert SPANDATA.GEN_AI_RESPONSE_MODEL in llm_span["attributes"] + assert ( + llm_span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == expected_model + ) + else: + assert SPANDATA.GEN_AI_RESPONSE_MODEL not in llm_span.get("attributes", {}) else: - assert SPANDATA.GEN_AI_RESPONSE_MODEL not in llm_span.get("data", {}) + events = capture_events() + + with start_transaction(): + callback.on_llm_start( + serialized=serialized, + prompts=prompts, + run_id=run_id, + 
invocation_params={"model": "gpt-3.5-turbo"}, + ) + + response_metadata = {"model_name": response_metadata_model} + message = AIMessageChunk( + content="Test response", response_metadata=response_metadata + ) + + generation = Mock(text="Test response", message=message) + response = Mock(generations=[[generation]]) + callback.on_llm_end(response=response, run_id=run_id) + + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" + + llm_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.text_completion" + ] + assert len(llm_spans) > 0 + + llm_span = llm_spans[0] + assert llm_span["data"]["gen_ai.operation.name"] == "text_completion" + + if expected_model is not None: + assert SPANDATA.GEN_AI_RESPONSE_MODEL in llm_span["data"] + assert llm_span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == expected_model + else: + assert SPANDATA.GEN_AI_RESPONSE_MODEL not in llm_span.get("data", {}) # Tests for multimodal content transformation functions @@ -2936,6 +4631,7 @@ def test_transform_google_file_data(self): } +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "ai_type,expected_system", [ @@ -2981,13 +4677,18 @@ def test_transform_google_file_data(self): ], ) def test_langchain_ai_system_detection( - sentry_init, capture_events, ai_type, expected_system + sentry_init, + capture_events, + capture_items, + ai_type, + expected_system, + stream_gen_ai_spans, ): sentry_init( integrations=[LangchainIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() callback = SentryLangchainCallback(max_span_map_size=100, include_prompts=True) @@ -2995,35 +4696,67 @@ def test_langchain_ai_system_detection( serialized = {"_type": ai_type} if ai_type is not None else {} prompts = ["Test prompt"] - with start_transaction(): - callback.on_llm_start( - serialized=serialized, - prompts=prompts, - run_id=run_id, - invocation_params={"_type": ai_type, 
"model": "test-model"}, - ) + if stream_gen_ai_spans: + items = capture_items("span") - generation = Mock(text="Test response", message=None) - response = Mock(generations=[[generation]]) - callback.on_llm_end(response=response, run_id=run_id) + with start_transaction(): + callback.on_llm_start( + serialized=serialized, + prompts=prompts, + run_id=run_id, + invocation_params={"_type": ai_type, "model": "test-model"}, + ) - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + generation = Mock(text="Test response", message=None) + response = Mock(generations=[[generation]]) + callback.on_llm_end(response=response, run_id=run_id) - llm_spans = [ - span - for span in tx.get("spans", []) - if span.get("op") == "gen_ai.text_completion" - ] - assert len(llm_spans) > 0 + spans = [item.payload for item in items if item.type == "span"] + llm_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == "gen_ai.text_completion" + ] - llm_span = llm_spans[0] + assert len(llm_spans) > 0 + llm_span = llm_spans[0] - if expected_system is not None: - assert llm_span["data"][SPANDATA.GEN_AI_SYSTEM] == expected_system + if expected_system is not None: + assert llm_span["attributes"][SPANDATA.GEN_AI_SYSTEM] == expected_system + else: + assert SPANDATA.GEN_AI_SYSTEM not in llm_span.get("attributes", {}) else: - assert SPANDATA.GEN_AI_SYSTEM not in llm_span.get("data", {}) + events = capture_events() + + with start_transaction(): + callback.on_llm_start( + serialized=serialized, + prompts=prompts, + run_id=run_id, + invocation_params={"_type": ai_type, "model": "test-model"}, + ) + + generation = Mock(text="Test response", message=None) + response = Mock(generations=[[generation]]) + callback.on_llm_end(response=response, run_id=run_id) + + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" + + llm_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == "gen_ai.text_completion" + ] + + assert 
len(llm_spans) > 0 + llm_span = llm_spans[0] + + if expected_system is not None: + assert llm_span["data"][SPANDATA.GEN_AI_SYSTEM] == expected_system + else: + assert SPANDATA.GEN_AI_SYSTEM not in llm_span.get("data", {}) class TestTransformLangchainMessageContent: diff --git a/tests/integrations/langgraph/test_langgraph.py b/tests/integrations/langgraph/test_langgraph.py index 2a385d8a78..f8df60739f 100644 --- a/tests/integrations/langgraph/test_langgraph.py +++ b/tests/integrations/langgraph/test_langgraph.py @@ -137,6 +137,7 @@ def test_langgraph_integration_init(): assert integration.origin == "auto.ai.langgraph" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -147,51 +148,98 @@ def test_langgraph_integration_init(): ], ) def test_state_graph_compile( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """Test StateGraph.compile() wrapper creates proper create_agent span.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + graph = MockStateGraph() def original_compile(self, *args, **kwargs): return MockCompiledGraph(self.name) - with patch("sentry_sdk.integrations.langgraph.StateGraph"): - with start_transaction(): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with patch("sentry_sdk.integrations.langgraph.StateGraph"), start_transaction(): wrapped_compile = _wrap_state_graph_compile(original_compile) compiled_graph = wrapped_compile( graph, model="test-model", checkpointer=None ) - assert compiled_graph is not None - assert compiled_graph.name == "test_graph" + assert compiled_graph is not None + assert compiled_graph.name == "test_graph" - tx = events[0] - 
assert tx["type"] == "transaction" + spans = [item.payload for item in items if item.type == "span"] + agent_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_CREATE_AGENT + ] + assert len(agent_spans) == 1 + agent_span = agent_spans[0] - agent_spans = [span for span in tx["spans"] if span["op"] == OP.GEN_AI_CREATE_AGENT] - assert len(agent_spans) == 1 + assert agent_span["name"] == "create_agent test_graph" + assert agent_span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + assert ( + agent_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "create_agent" + ) + assert agent_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" + assert agent_span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "test-model" + assert SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS in agent_span["attributes"] + + tools_data = agent_span["attributes"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + assert tools_data == ["search_tool", "calculator"] + assert len(tools_data) == 2 + assert "search_tool" in tools_data + assert "calculator" in tools_data + else: + events = capture_events() - agent_span = agent_spans[0] - assert agent_span["description"] == "create_agent test_graph" - assert agent_span["origin"] == "auto.ai.langgraph" - assert agent_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "create_agent" - assert agent_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" - assert agent_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "test-model" - assert SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS in agent_span["data"] + with patch("sentry_sdk.integrations.langgraph.StateGraph"), start_transaction(): + wrapped_compile = _wrap_state_graph_compile(original_compile) + compiled_graph = wrapped_compile( + graph, model="test-model", checkpointer=None + ) - tools_data = agent_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] - assert tools_data == ["search_tool", "calculator"] - assert len(tools_data) == 2 - assert "search_tool" in tools_data - 
assert "calculator" in tools_data + assert compiled_graph is not None + assert compiled_graph.name == "test_graph" + tx = events[0] + assert tx["type"] == "transaction" + agent_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_CREATE_AGENT + ] + assert len(agent_spans) == 1 + agent_span = agent_spans[0] + + assert agent_span["description"] == "create_agent test_graph" + assert agent_span["origin"] == "auto.ai.langgraph" + assert agent_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "create_agent" + assert agent_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" + assert agent_span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "test-model" + assert SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS in agent_span["data"] + + tools_data = agent_span["data"][SPANDATA.GEN_AI_REQUEST_AVAILABLE_TOOLS] + + assert tools_data == ["search_tool", "calculator"] + assert len(tools_data) == 2 + assert "search_tool" in tools_data + assert "calculator" in tools_data + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -201,14 +249,21 @@ def original_compile(self, *args, **kwargs): (False, False), ], ) -def test_pregel_invoke(sentry_init, capture_events, send_default_pii, include_prompts): +def test_pregel_invoke( + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, +): """Test Pregel.invoke() wrapper creates proper invoke_agent span.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -239,59 +294,133 @@ def original_invoke(self, *args, **kwargs): ] return {"messages": new_messages} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + 
items = capture_items("transaction", "span") - assert result is not None + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - tx = events[0] - assert tx["type"] == "transaction" + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - invoke_span = invoke_spans[0] - assert invoke_span["description"] == "invoke_agent test_graph" - assert invoke_span["origin"] == "auto.ai.langgraph" - assert invoke_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" - assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "test_graph" - assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" + invoke_span = invoke_spans[0] + assert invoke_span["name"] == "invoke_agent test_graph" + assert invoke_span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + assert ( + invoke_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" + ) + assert invoke_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == "test_graph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["attributes"] - request_messages = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + request_messages = invoke_span["attributes"][ + SPANDATA.GEN_AI_REQUEST_MESSAGES + ] - if isinstance(request_messages, str): - import json + if 
isinstance(request_messages, str): + import json - request_messages = json.loads(request_messages) - assert len(request_messages) == 1 - assert request_messages[0]["content"] == "Of course! How can I assist you?" + request_messages = json.loads(request_messages) + assert len(request_messages) == 1 + assert request_messages[0]["content"] == "Of course! How can I assist you?" - response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert response_text == expected_assistant_response + response_text = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_text == expected_assistant_response - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] - tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] - if isinstance(tool_calls_data, str): - import json + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["attributes"] + tool_calls_data = invoke_span["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] + + if isinstance(tool_calls_data, str): + import json - tool_calls_data = json.loads(tool_calls_data) + tool_calls_data = json.loads(tool_calls_data) - assert len(tool_calls_data) == 1 - assert tool_calls_data[0]["id"] == "call_test_123" - assert tool_calls_data[0]["function"]["name"] == "search_tool" + assert len(tool_calls_data) == 1 + assert tool_calls_data[0]["id"] == "call_test_123" + assert tool_calls_data[0]["function"]["name"] == "search_tool" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get( + "attributes", {} + ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get("data", {}) + events = capture_events() + + with 
start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + tx = events[0] + assert tx["type"] == "transaction" + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_span = invoke_spans[0] + assert invoke_span["description"] == "invoke_agent test_graph" + assert invoke_span["origin"] == "auto.ai.langgraph" + assert invoke_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" + assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "test_graph" + assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "test_graph" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] + + request_messages = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + + if isinstance(request_messages, str): + import json + + request_messages = json.loads(request_messages) + assert len(request_messages) == 1 + assert request_messages[0]["content"] == "Of course! How can I assist you?" 
+ + response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_text == expected_assistant_response + + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] + tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + + if isinstance(tool_calls_data, str): + import json + + tool_calls_data = json.loads(tool_calls_data) + + assert len(tool_calls_data) == 1 + assert tool_calls_data[0]["id"] == "call_test_123" + assert tool_calls_data[0]["function"]["name"] == "search_tool" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get( + "data", {} + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -301,14 +430,22 @@ def original_invoke(self, *args, **kwargs): (False, False), ], ) -def test_pregel_ainvoke(sentry_init, capture_events, send_default_pii, include_prompts): +def test_pregel_ainvoke( + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, +): """Test Pregel.ainvoke() async wrapper creates proper invoke_agent span.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + test_state = {"messages": [MockMessage("What's the weather like?", name="user")]} pregel = MockPregelInstance("async_graph") @@ -338,84 +475,181 @@ async def run_test(): result = await wrapped_ainvoke(pregel, test_state) return result - result = asyncio.run(run_test()) - assert result is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - assert tx["type"] == "transaction" + result = asyncio.run(run_test()) + assert result 
is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - invoke_span = invoke_spans[0] - assert invoke_span["description"] == "invoke_agent async_graph" - assert invoke_span["origin"] == "auto.ai.langgraph" - assert invoke_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" - assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "async_graph" - assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "async_graph" + invoke_span = invoke_spans[0] + assert invoke_span["name"] == "invoke_agent async_graph" + assert invoke_span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + assert ( + invoke_span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" + ) + assert invoke_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == "async_graph" + assert invoke_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == "async_graph" - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["attributes"] - response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert response_text == expected_assistant_response + response_text = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_text == expected_assistant_response - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] - tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] - if isinstance(tool_calls_data, str): - import json + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS 
in invoke_span["attributes"] + tool_calls_data = invoke_span["attributes"][ + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS + ] + + if isinstance(tool_calls_data, str): + import json - tool_calls_data = json.loads(tool_calls_data) + tool_calls_data = json.loads(tool_calls_data) - assert len(tool_calls_data) == 1 - assert tool_calls_data[0]["id"] == "call_weather_456" - assert tool_calls_data[0]["function"]["name"] == "get_weather" + assert len(tool_calls_data) == 1 + assert tool_calls_data[0]["id"] == "call_weather_456" + assert tool_calls_data[0]["function"]["name"] == "get_weather" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get( + "attributes", {} + ) else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get("data", {}) + events = capture_events() + + result = asyncio.run(run_test()) + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_span = invoke_spans[0] + assert invoke_span["description"] == "invoke_agent async_graph" + assert invoke_span["origin"] == "auto.ai.langgraph" + assert invoke_span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "invoke_agent" + assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == "async_graph" + assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == "async_graph" + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] -def test_pregel_invoke_error(sentry_init, capture_events): + response_text 
= invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_text == expected_assistant_response + + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] + tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + + if isinstance(tool_calls_data, str): + import json + + tool_calls_data = json.loads(tool_calls_data) + + assert len(tool_calls_data) == 1 + assert tool_calls_data[0]["id"] == "call_weather_456" + assert tool_calls_data[0]["function"]["name"] == "get_weather" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS not in invoke_span.get( + "data", {} + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_invoke_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test error handling during graph execution.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + test_state = {"messages": [MockMessage("This will fail")]} pregel = MockPregelInstance("error_graph") def original_invoke(self, *args, **kwargs): raise Exception("Graph execution failed") - with start_transaction(), pytest.raises(Exception, match="Graph execution failed"): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + with start_transaction(), pytest.raises( + Exception, match="Graph execution failed" + ): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + wrapped_invoke(pregel, test_state) + + spans = [item.payload for item in items if 
item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - invoke_span = invoke_spans[0] - assert invoke_span.get("status") == "internal_error" - assert invoke_span.get("tags", {}).get("status") == "internal_error" + invoke_span = invoke_spans[0] + assert invoke_span.get("status") == "error" + else: + events = capture_events() + with start_transaction(), pytest.raises( + Exception, match="Graph execution failed" + ): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + wrapped_invoke(pregel, test_state) + + tx = events[0] + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 -def test_pregel_ainvoke_error(sentry_init, capture_events): + invoke_span = invoke_spans[0] + assert invoke_span.get("status") == "internal_error" + assert invoke_span.get("tags", {}).get("status") == "internal_error" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_ainvoke_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test error handling during async graph execution.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + test_state = {"messages": [MockMessage("This will fail async")]} pregel = MockPregelInstance("async_error_graph") @@ -429,56 +663,103 @@ async def run_error_test(): wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke) await wrapped_ainvoke(pregel, test_state) - asyncio.run(run_error_test()) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + asyncio.run(run_error_test()) + + spans = [item.payload for item in 
items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_span = invoke_spans[0] + assert invoke_span.get("status") == "error" + else: + events = capture_events() - invoke_span = invoke_spans[0] - assert invoke_span.get("status") == "internal_error" - assert invoke_span.get("tags", {}).get("status") == "internal_error" + asyncio.run(run_error_test()) + + tx = events[0] + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_span = invoke_spans[0] + assert invoke_span.get("status") == "internal_error" + assert invoke_span.get("tags", {}).get("status") == "internal_error" -def test_span_origin(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_origin( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that span origins are correctly set.""" sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() graph = MockStateGraph() def original_compile(self, *args, **kwargs): return MockCompiledGraph(self.name) - with start_transaction(): - from sentry_sdk.integrations.langgraph import _wrap_state_graph_compile + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - wrapped_compile = _wrap_state_graph_compile(original_compile) - wrapped_compile(graph) + with start_transaction(): + from sentry_sdk.integrations.langgraph import _wrap_state_graph_compile - tx = events[0] - assert tx["contexts"]["trace"]["origin"] == "manual" + wrapped_compile = _wrap_state_graph_compile(original_compile) + wrapped_compile(graph) + + tx = next(item.payload for item in items if item.type == "transaction") + assert tx["contexts"]["trace"]["origin"] == "manual" - for span in tx["spans"]: - assert 
span["origin"] == "auto.ai.langgraph" + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + assert span["attributes"]["sentry.origin"] == "auto.ai.langgraph" + else: + events = capture_events() + with start_transaction(): + from sentry_sdk.integrations.langgraph import _wrap_state_graph_compile + wrapped_compile = _wrap_state_graph_compile(original_compile) + wrapped_compile(graph) + + tx = events[0] + assert tx["contexts"]["trace"]["origin"] == "manual" + + for span in tx["spans"]: + assert span["origin"] == "auto.ai.langgraph" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize("graph_name", ["my_graph", None, ""]) def test_pregel_invoke_with_different_graph_names( - sentry_init, capture_events, graph_name + sentry_init, + capture_events, + capture_items, + graph_name, + stream_gen_ai_spans, ): """Test Pregel.invoke() with different graph name scenarios.""" sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() pregel = MockPregelInstance(graph_name) if graph_name else MockPregelInstance() if not graph_name: @@ -488,29 +769,67 @@ def test_pregel_invoke_with_different_graph_names( def original_invoke(self, *args, **kwargs): return {"result": "test"} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - wrapped_invoke(pregel, {"messages": []}) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + wrapped_invoke(pregel, {"messages": []}) + + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == 
OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - invoke_span = invoke_spans[0] + invoke_span = invoke_spans[0] - if graph_name and graph_name.strip(): - assert invoke_span["description"] == "invoke_agent my_graph" - assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == graph_name - assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == graph_name + if graph_name and graph_name.strip(): + assert invoke_span["name"] == "invoke_agent my_graph" + assert ( + invoke_span["attributes"][SPANDATA.GEN_AI_PIPELINE_NAME] == graph_name + ) + assert invoke_span["attributes"][SPANDATA.GEN_AI_AGENT_NAME] == graph_name + else: + assert invoke_span["name"] == "invoke_agent" + assert SPANDATA.GEN_AI_PIPELINE_NAME not in invoke_span.get( + "attributes", {} + ) + assert SPANDATA.GEN_AI_AGENT_NAME not in invoke_span.get("attributes", {}) else: - assert invoke_span["description"] == "invoke_agent" - assert SPANDATA.GEN_AI_PIPELINE_NAME not in invoke_span.get("data", {}) - assert SPANDATA.GEN_AI_AGENT_NAME not in invoke_span.get("data", {}) + events = capture_events() + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + wrapped_invoke(pregel, {"messages": []}) + + tx = events[0] + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_span = invoke_spans[0] + + if graph_name and graph_name.strip(): + assert invoke_span["description"] == "invoke_agent my_graph" + assert invoke_span["data"][SPANDATA.GEN_AI_PIPELINE_NAME] == graph_name + assert invoke_span["data"][SPANDATA.GEN_AI_AGENT_NAME] == graph_name + else: + assert invoke_span["description"] == "invoke_agent" + assert SPANDATA.GEN_AI_PIPELINE_NAME not in invoke_span.get("data", {}) + assert SPANDATA.GEN_AI_AGENT_NAME not in invoke_span.get("data", {}) -def test_pregel_invoke_span_includes_usage_data(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def 
test_pregel_invoke_span_includes_usage_data( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent spans include aggregated usage data from context_wrapper. This verifies the new functionality added to track token usage in invoke_agent spans. @@ -518,8 +837,8 @@ def test_pregel_invoke_span_includes_usage_data(sentry_init, capture_events): sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -558,35 +877,73 @@ def original_invoke(self, *args, **kwargs): ] return {"messages": new_messages} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert result is not None + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - tx = events[0] - assert tx["type"] == "transaction" + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["attributes"] + + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert 
invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() - invoke_agent_span = invoke_spans[0] + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - # Verify invoke_agent span has usage data - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + invoke_agent_span = invoke_spans[0] - # The usage should match the mock_usage values (aggregated across all calls) - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + # Verify invoke_agent span has usage data + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 -def test_pregel_ainvoke_span_includes_usage_data(sentry_init, capture_events): + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def 
test_pregel_ainvoke_span_includes_usage_data( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent spans include aggregated usage data from context_wrapper. This verifies the new functionality added to track token usage in invoke_agent spans. @@ -594,8 +951,8 @@ def test_pregel_ainvoke_span_includes_usage_data(sentry_init, capture_events): sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -640,32 +997,67 @@ async def run_test(): result = await wrapped_ainvoke(pregel, test_state) return result - result = asyncio.run(run_test()) - assert result is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - assert tx["type"] == "transaction" + result = asyncio.run(run_test()) + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["attributes"] + + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() - 
invoke_agent_span = invoke_spans[0] + result = asyncio.run(run_test()) + assert result is not None - # Verify invoke_agent span has usage data - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + tx = events[0] + assert tx["type"] == "transaction" - # The usage should match the mock_usage values (aggregated across all calls) - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 -def test_pregel_invoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_events): + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_invoke_multiple_llm_calls_aggregate_usage( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls (e.g., when tools are used and multiple API calls are made). 
@@ -673,8 +1065,8 @@ def test_pregel_invoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_e sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -724,29 +1116,61 @@ def original_invoke(self, *args, **kwargs): ] return {"messages": new_messages} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert result is not None + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - tx = events[0] - assert tx["type"] == "transaction" + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 - invoke_agent_span = invoke_spans[0] + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 50 + else: + events = capture_events() - # Verify invoke_agent span has aggregated usage from both API calls - # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert 
invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + tx = events[0] + assert tx["type"] == "transaction" -def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_events): + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls (e.g., when tools are used and multiple API calls are made). 
@@ -754,8 +1178,8 @@ def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_ sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -811,26 +1235,55 @@ async def run_test(): result = await wrapped_ainvoke(pregel, test_state) return result - result = asyncio.run(run_test()) - assert result is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - assert tx["type"] == "transaction" + result = asyncio.run(run_test()) + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 - invoke_agent_span = invoke_spans[0] + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 50 + else: + events = capture_events() - # Verify invoke_agent span has aggregated usage from both API calls - # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + result = asyncio.run(run_test()) + assert result is not None + tx = events[0] + assert tx["type"] == "transaction" -def 
test_pregel_invoke_span_includes_response_model(sentry_init, capture_events): + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_invoke_span_includes_response_model( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent spans include the response model. When an agent makes multiple LLM calls, it should report the last model used. @@ -838,8 +1291,8 @@ def test_pregel_invoke_span_includes_response_model(sentry_init, capture_events) sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -878,29 +1331,66 @@ def original_invoke(self, *args, **kwargs): ] return {"messages": new_messages} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert result is not None + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - tx = events[0] - assert tx["type"] == "transaction" + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for 
span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - invoke_agent_span = invoke_spans[0] + invoke_agent_span = invoke_spans[0] - # Verify invoke_agent span has response model - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + # Verify invoke_agent span has response model + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) + else: + events = capture_events() + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + invoke_agent_span = invoke_spans[0] -def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_events): + # Verify invoke_agent span has response model + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert ( + invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_ainvoke_span_includes_response_model( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent spans include the response model. When an agent makes multiple LLM calls, it should report the last model used. 
@@ -908,8 +1398,8 @@ def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_events sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -954,26 +1444,60 @@ async def run_test(): result = await wrapped_ainvoke(pregel, test_state) return result - result = asyncio.run(run_test()) - assert result is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - assert tx["type"] == "transaction" + result = asyncio.run(run_test()) + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has response model + assert invoke_agent_span["name"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) + else: + events = capture_events() - invoke_agent_span = invoke_spans[0] + result = asyncio.run(run_test()) + assert result is not None - # Verify invoke_agent span has response model - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + tx = events[0] + assert tx["type"] == "transaction" + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 -def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_events): + invoke_agent_span = 
invoke_spans[0] + + # Verify invoke_agent span has response model + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert ( + invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_invoke_span_uses_last_response_model( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that when an agent makes multiple LLM calls (e.g., with tools), the invoke_agent span reports the last response model used. @@ -981,8 +1505,8 @@ def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_events sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -1034,28 +1558,64 @@ def original_invoke(self, *args, **kwargs): ] return {"messages": new_messages} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert result is not None + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - tx = events[0] - assert tx["type"] == "transaction" + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + 
invoke_agent_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) + else: + events = capture_events() + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - invoke_agent_span = invoke_spans[0] + assert result is not None - # Verify invoke_agent span uses the LAST response model - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert ( + invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) -def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_pregel_ainvoke_span_uses_last_response_model( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that when an agent makes multiple LLM calls (e.g., with tools), the invoke_agent span reports the last response model used. 
@@ -1063,8 +1623,8 @@ def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_event sentry_init( integrations=[LanggraphIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() test_state = { "messages": [ @@ -1122,22 +1682,49 @@ async def run_test(): result = await wrapped_ainvoke(pregel, test_state) return result - result = asyncio.run(run_test()) - assert result is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - tx = events[0] - assert tx["type"] == "transaction" + result = asyncio.run(run_test()) + assert result is not None - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - invoke_agent_span = invoke_spans[0] + invoke_agent_span = invoke_spans[0] - # Verify invoke_agent span uses the LAST response model - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + # Verify invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) + else: + events = capture_events() + + result = asyncio.run(run_test()) + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert ( + invoke_agent_span["data"]["gen_ai.response.model"] 
== "gpt-4.1-2025-04-14" + ) def test_complex_message_parsing(): @@ -1187,14 +1774,20 @@ def test_complex_message_parsing(): assert result[2]["function_call"]["name"] == "search" -def test_extraction_functions_complex_scenario(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_extraction_functions_complex_scenario( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test extraction functions with complex scenarios including multiple messages and edge cases.""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() pregel = MockPregelInstance("complex_graph") test_state = {"messages": [MockMessage("Complex request", name="user")]} @@ -1229,27 +1822,57 @@ def original_invoke(self, *args, **kwargs): ] return {"messages": new_messages} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert result is not None + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - tx = events[0] - invoke_spans = [ - span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) == 1 + assert result is not None - invoke_span = invoke_spans[0] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] - response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert response_text == "Final response" + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 - assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] - import json + 
invoke_span = invoke_spans[0] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["attributes"] + response_text = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_text == "Final response" + + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["attributes"] + import json + + tool_calls_data = invoke_span["attributes"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] + else: + events = capture_events() + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_span = invoke_spans[0] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in invoke_span["data"] + response_text = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_text == "Final response" + + assert SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS in invoke_span["data"] + import json + + tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] - tool_calls_data = invoke_span["data"][SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS] if isinstance(tool_calls_data, str): tool_calls_data = json.loads(tool_calls_data) @@ -1260,14 +1883,20 @@ def original_invoke(self, *args, **kwargs): assert tool_calls_data[1]["function"]["name"] == "calculate" -def test_langgraph_message_role_mapping(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langgraph_message_role_mapping( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that Langgraph integration properly maps message roles like 'ai' to 'assistant'""" sentry_init( integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() # Mock a langgraph message with mixed roles class MockMessage: @@ 
-1288,50 +1917,83 @@ def __init__(self, content, message_type="human"): compiled_graph = MockCompiledGraph("test_graph") pregel = MockPregelInstance(compiled_graph) - with start_transaction(name="langgraph tx"): - # Use the wrapped invoke function directly - from sentry_sdk.integrations.langgraph import _wrap_pregel_invoke + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - wrapped_invoke = _wrap_pregel_invoke( - lambda self, state_data: {"result": "success"} - ) - wrapped_invoke(pregel, state_data) + with start_transaction(name="langgraph tx"): + # Use the wrapped invoke function directly + from sentry_sdk.integrations.langgraph import _wrap_pregel_invoke - (event,) = events - span = event["spans"][0] + wrapped_invoke = _wrap_pregel_invoke( + lambda self, state_data: {"result": "success"} + ) + wrapped_invoke(pregel, state_data) - # Verify that the span was created correctly - assert span["op"] == "gen_ai.invoke_agent" + span = next(item.payload for item in items if item.type == "span") - # If messages were captured, verify role mapping - if SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]: - import json + # Verify that the span was created correctly + assert span["attributes"]["sentry.op"] == "gen_ai.invoke_agent" - stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + # If messages were captured, verify role mapping + if SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"]: + import json - # Find messages with specific content to verify role mapping - ai_message = next( - (msg for msg in stored_messages if msg.get("content") == "Hi there!"), None - ) - assistant_message = next( - (msg for msg in stored_messages if msg.get("content") == "How can I help?"), - None, - ) + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="langgraph tx"): + # Use the wrapped invoke function directly + from 
sentry_sdk.integrations.langgraph import _wrap_pregel_invoke + + wrapped_invoke = _wrap_pregel_invoke( + lambda self, state_data: {"result": "success"} + ) + wrapped_invoke(pregel, state_data) + + (event,) = events + span = event["spans"][0] + + # Verify that the span was created correctly + assert span["op"] == "gen_ai.invoke_agent" - if ai_message: - # "ai" should have been mapped to "assistant" - assert ai_message["role"] == "assistant" + # If messages were captured, verify role mapping + if SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]: + import json + + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + + # Find messages with specific content to verify role mapping + ai_message = next( + (msg for msg in stored_messages if msg.get("content") == "Hi there!"), None + ) + assistant_message = next( + (msg for msg in stored_messages if msg.get("content") == "How can I help?"), + None, + ) - if assistant_message: - # "assistant" should stay "assistant" - assert assistant_message["role"] == "assistant" + if ai_message: + # "ai" should have been mapped to "assistant" + assert ai_message["role"] == "assistant" - # Verify no "ai" roles remain - roles = [msg["role"] for msg in stored_messages if "role" in msg] - assert "ai" not in roles + if assistant_message: + # "assistant" should stay "assistant" + assert assistant_message["role"] == "assistant" + # Verify no "ai" roles remain + roles = [msg["role"] for msg in stored_messages if "role" in msg] + assert "ai" not in roles -def test_langgraph_message_truncation(sentry_init, capture_events): + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_langgraph_message_truncation( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that large messages are truncated properly in Langgraph integration.""" import json @@ -1339,8 +2001,8 @@ def test_langgraph_message_truncation(sentry_init, capture_events): 
integrations=[LanggraphIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() large_content = ( "This is a very long message that will exceed our size limits. " * 1000 @@ -1360,28 +2022,66 @@ def test_langgraph_message_truncation(sentry_init, capture_events): def original_invoke(self, *args, **kwargs): return {"messages": args[0].get("messages", [])} - with start_transaction(): - wrapped_invoke = _wrap_pregel_invoke(original_invoke) - result = wrapped_invoke(pregel, test_state) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert result is not None - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) - invoke_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_INVOKE_AGENT - ] - assert len(invoke_spans) > 0 + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + invoke_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_INVOKE_AGENT + ] + + assert len(invoke_spans) > 0 + + invoke_span = invoke_spans[0] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["attributes"] + + messages_data = invoke_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + + assert isinstance(messages_data, str) + + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) + (tx,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + assert len(events) > 0 + tx = events[0] 
+ assert tx["type"] == "transaction" + + invoke_spans = [ + span + for span in tx.get("spans", []) + if span.get("op") == OP.GEN_AI_INVOKE_AGENT + ] + + assert len(invoke_spans) > 0 + + invoke_span = invoke_spans[0] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] - invoke_span = invoke_spans[0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in invoke_span["data"] + messages_data = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) - messages_data = invoke_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert isinstance(messages_data, str) + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) - parsed_messages = json.loads(messages_data) - assert isinstance(parsed_messages, list) - assert len(parsed_messages) == 1 - assert "small message 5" in str(parsed_messages[0]) assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index 18f8cfaf6e..22663f9472 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -131,6 +131,7 @@ def __init__( self.created = 1234567890 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -144,17 +145,19 @@ def test_nonstreaming_chat_completion( reset_litellm_executor, sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -176,12 
+179,14 @@ def test_nonstreaming_chat_completion( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): litellm.completion( model="gpt-3.5-turbo", messages=messages, @@ -190,39 +195,86 @@ def test_nonstreaming_chat_completion( litellm_utils.executor.shutdown(wait=True) - assert len(events) == 1 - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "litellm test" - assert event["type"] == "transaction" - assert event["transaction"] == "litellm test" - - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat gpt-3.5-turbo" + 
assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["attributes"] + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + litellm_utils.executor.shutdown(wait=True) + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "litellm test" + + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat gpt-3.5-turbo" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == 
"gpt-3.5-turbo" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"] + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -236,17 +288,19 @@ def test_nonstreaming_chat_completion( async def test_async_nonstreaming_chat_completion( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -268,12 +322,14 @@ async def test_async_nonstreaming_chat_completion( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): await litellm.acompletion( model="gpt-3.5-turbo", messages=messages, @@ -283,39 +339,87 @@ async def test_async_nonstreaming_chat_completion( await 
GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - assert len(events) == 1 - (event,) = events - - assert event["type"] == "transaction" - assert event["transaction"] == "litellm test" + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["transaction"] == "litellm test" - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" - - if send_default_pii and include_prompts: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["attributes"] + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + 
assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert span["attributes"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + assert event["transaction"] == "litellm test" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 - assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 - assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat gpt-3.5-turbo" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat" + + if send_default_pii and include_prompts: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"] + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20 + assert 
span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -329,18 +433,20 @@ def test_streaming_chat_completion( reset_litellm_executor, sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, streaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -354,12 +460,14 @@ def test_streaming_chat_completion( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): response = litellm.completion( model="gpt-3.5-turbo", messages=messages, @@ -371,22 +479,54 @@ def test_streaming_chat_completion( streaming_handler.executor.shutdown(wait=True) - assert len(events) == 1 - (event,) = events + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] - assert event["type"] == "transaction" - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert 
span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + else: + events = capture_events() - assert span["op"] == OP.GEN_AI_CHAT - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + response = litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + stream=True, + ) + for _ in response: + pass + streaming_handler.executor.shutdown(wait=True) + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + + assert span["op"] == OP.GEN_AI_CHAT + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -400,19 +540,21 @@ def test_streaming_chat_completion( async def test_async_streaming_chat_completion( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, streaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -428,12 +570,14 @@ async def test_async_streaming_chat_completion( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = 
capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): response = await litellm.acompletion( model="gpt-3.5-turbo", messages=messages, @@ -446,28 +590,63 @@ async def test_async_streaming_chat_completion( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - assert len(events) == 1 - (event,) = events + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] - assert event["type"] == "transaction" - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + response = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + stream=True, + ) + async for _ in response: + pass + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] - assert span["op"] == OP.GEN_AI_CHAT - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert span["op"] == OP.GEN_AI_CHAT + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_embeddings_create( sentry_init, capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, + stream_gen_ai_spans, ): """ Test that litellm.embedding() calls are properly instrumented. @@ -479,8 +658,8 @@ def test_embeddings_create( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="test-key") @@ -490,51 +669,100 @@ def test_embeddings_create( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.embeddings._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - response = litellm.embedding( - model="text-embedding-ada-002", - input="Hello, world!", - client=client, + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = litellm.embedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) + # Allow time for callbacks to complete (they may run in separate threads) + time.sleep(0.1) + + # Response is processed by litellm, so just check it exists + assert response is not None + spans = [item.payload for item in items if item.type == "span"] + spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) - # Allow time for callbacks to complete (they may run in separate threads) - time.sleep(0.1) - # Response is processed by litellm, so just check it exists - assert response is not None - assert len(events) == 1 - (event,) = events + assert len(spans) == 1 + span = spans[0] - assert event["type"] == 
"transaction" - spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" - ) - assert len(spans) == 1 - span = spans[0] - - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["description"] == "embeddings text-embedding-ada-002" - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" - # Check that embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - assert json.loads(embeddings_input) == ["Hello, world!"] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["name"] == "embeddings text-embedding-ada-002" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-ada-002" + ) + # Check that embeddings input is captured (it's JSON serialized) + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert json.loads(embeddings_input) == ["Hello, world!"] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = litellm.embedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) + # Allow time for callbacks to complete (they may run in separate threads) + time.sleep(0.1) + + # Response is processed by litellm, so just check it exists + assert response is not None + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + ) + assert len(spans) == 
1 + span = spans[0] + + assert span["op"] == OP.GEN_AI_EMBEDDINGS + assert span["description"] == "embeddings text-embedding-ada-002" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" + ) + # Check that embeddings input is captured (it's JSON serialized) + embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert json.loads(embeddings_input) == ["Hello, world!"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_embeddings_create( sentry_init, capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, + stream_gen_ai_spans, ): """ Test that litellm.embedding() calls are properly instrumented. @@ -546,8 +774,8 @@ async def test_async_embeddings_create( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="test-key") @@ -557,59 +785,109 @@ async def test_async_embeddings_create( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.embeddings._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - response = await litellm.aembedding( - model="text-embedding-ada-002", - input="Hello, world!", - client=client, + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = await litellm.aembedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + 
# Response is processed by litellm, so just check it exists + assert response is not None + spans = [item.payload for item in items if item.type == "span"] + spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] + + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["name"] == "embeddings text-embedding-ada-002" + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["attributes"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-ada-002" ) + # Check that embeddings input is captured (it's JSON serialized) + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - await GLOBAL_LOGGING_WORKER.flush() - await asyncio.sleep(0.5) + assert json.loads(embeddings_input) == ["Hello, world!"] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = await litellm.aembedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) - # Response is processed by litellm, so just check it exists - assert response is not None - assert len(events) == 1 - (event,) = events + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) - assert event["type"] == "transaction" - spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" - ) - assert len(spans) == 1 - span = spans[0] + # Response is processed by litellm, so just check it exists + assert response is not None + assert len(events) == 1 + (event,) = events - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["description"] == "embeddings text-embedding-ada-002" - assert 
span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" - # Check that embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - assert json.loads(embeddings_input) == ["Hello, world!"] + assert event["type"] == "transaction" + spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] + + assert span["op"] == OP.GEN_AI_EMBEDDINGS + assert span["description"] == "embeddings text-embedding-ada-002" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 5 + assert ( + span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" + ) + # Check that embeddings input is captured (it's JSON serialized) + embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert json.loads(embeddings_input) == ["Hello, world!"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_embeddings_create_with_list_input( sentry_init, capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, + stream_gen_ai_spans, ): """Test embedding with list input.""" sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="test-key") @@ -619,60 +897,106 @@ def test_embeddings_create_with_list_input( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.embeddings._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - response = litellm.embedding( - model="text-embedding-ada-002", 
- input=["First text", "Second text", "Third text"], - client=client, - ) - # Allow time for callbacks to complete (they may run in separate threads) - time.sleep(0.1) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Response is processed by litellm, so just check it exists - assert response is not None - assert len(events) == 1 - (event,) = events + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = litellm.embedding( + model="text-embedding-ada-002", + input=["First text", "Second text", "Third text"], + client=client, + ) + # Allow time for callbacks to complete (they may run in separate threads) + time.sleep(0.1) + + # Response is processed by litellm, so just check it exists + assert response is not None + + spans = [item.payload for item in items if item.type == "span"] + spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] + + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + # Check that list of embeddings input is captured (it's JSON serialized) + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + + assert json.loads(embeddings_input) == [ + "First text", + "Second text", + "Third text", + ] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = litellm.embedding( + model="text-embedding-ada-002", + input=["First text", "Second text", "Third text"], + client=client, + ) + # Allow time for callbacks to complete (they may run in separate threads) + time.sleep(0.1) + + # Response is processed by litellm, so just 
check it exists + assert response is not None + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] - assert event["type"] == "transaction" - spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" - ) - assert len(spans) == 1 - span = spans[0] - - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - # Check that list of embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - assert json.loads(embeddings_input) == [ - "First text", - "Second text", - "Third text", - ] + assert span["op"] == OP.GEN_AI_EMBEDDINGS + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + # Check that list of embeddings input is captured (it's JSON serialized) + embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert json.loads(embeddings_input) == [ + "First text", + "Second text", + "Third text", + ] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_embeddings_create_with_list_input( sentry_init, capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, + stream_gen_ai_spans, ): """Test embedding with list input.""" sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="test-key") @@ -682,60 +1006,106 @@ async def test_async_embeddings_create_with_list_input( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.embeddings._client._client, - 
"send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - response = await litellm.aembedding( - model="text-embedding-ada-002", - input=["First text", "Second text", "Third text"], - client=client, + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = await litellm.aembedding( + model="text-embedding-ada-002", + input=["First text", "Second text", "Third text"], + client=client, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + # Response is processed by litellm, so just check it exists + assert response is not None + spans = [item.payload for item in items if item.type == "span"] + spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) + assert len(spans) == 1 + span = spans[0] + + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + assert span["attributes"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + # Check that list of embeddings input is captured (it's JSON serialized) + embeddings_input = span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + + assert json.loads(embeddings_input) == [ + "First text", + "Second text", + "Third text", + ] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = await litellm.aembedding( + model="text-embedding-ada-002", + input=["First text", "Second text", "Third text"], + client=client, + ) - await GLOBAL_LOGGING_WORKER.flush() - await asyncio.sleep(0.5) + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) - # Response is processed by litellm, so just check it exists - assert response is not None - 
assert len(events) == 1 - (event,) = events + # Response is processed by litellm, so just check it exists + assert response is not None + assert len(events) == 1 + (event,) = events - assert event["type"] == "transaction" - spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" - ) - assert len(spans) == 1 - span = spans[0] - - assert span["op"] == OP.GEN_AI_EMBEDDINGS - assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" - # Check that list of embeddings input is captured (it's JSON serialized) - embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] - assert json.loads(embeddings_input) == [ - "First text", - "Second text", - "Third text", - ] + assert event["type"] == "transaction" + spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] + + assert span["op"] == OP.GEN_AI_EMBEDDINGS + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "embeddings" + # Check that list of embeddings input is captured (it's JSON serialized) + embeddings_input = span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT] + assert json.loads(embeddings_input) == [ + "First text", + "Second text", + "Third text", + ] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_embeddings_no_pii( sentry_init, capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, + stream_gen_ai_spans, ): """Test that PII is not captured when disabled.""" sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="test-key") @@ -745,54 +1115,92 @@ def test_embeddings_no_pii( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - 
client.embeddings._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - response = litellm.embedding( - model="text-embedding-ada-002", - input="Hello, world!", - client=client, - ) - # Allow time for callbacks to complete (they may run in separate threads) - time.sleep(0.1) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Response is processed by litellm, so just check it exists - assert response is not None - assert len(events) == 1 - (event,) = events + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = litellm.embedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) + # Allow time for callbacks to complete (they may run in separate threads) + time.sleep(0.1) + + # Response is processed by litellm, so just check it exists + assert response is not None + spans = [item.payload for item in items if item.type == "span"] + spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] - assert event["type"] == "transaction" - spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" - ) - assert len(spans) == 1 - span = spans[0] + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + # Check that embeddings input is NOT captured when PII is disabled + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = litellm.embedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) + # Allow time 
for callbacks to complete (they may run in separate threads) + time.sleep(0.1) + + # Response is processed by litellm, so just check it exists + assert response is not None + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS - # Check that embeddings input is NOT captured when PII is disabled - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert span["op"] == OP.GEN_AI_EMBEDDINGS + # Check that embeddings input is NOT captured when PII is disabled + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_embeddings_no_pii( sentry_init, capture_events, + capture_items, get_model_response, openai_embedding_model_response, clear_litellm_cache, + stream_gen_ai_spans, ): """Test that PII is not captured when disabled.""" sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="test-key") @@ -802,48 +1210,90 @@ async def test_async_embeddings_no_pii( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.embeddings._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - response = await litellm.aembedding( - model="text-embedding-ada-002", - input="Hello, world!", - client=client, + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + 
response = await litellm.aembedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + # Response is processed by litellm, so just check it exists + assert response is not None + spans = [item.payload for item in items if item.type == "span"] + spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" ) + assert len(spans) == 1 + span = spans[0] - await GLOBAL_LOGGING_WORKER.flush() - await asyncio.sleep(0.5) + assert span["attributes"]["sentry.op"] == OP.GEN_AI_EMBEDDINGS + # Check that embeddings input is NOT captured when PII is disabled + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ): + with start_transaction(name="litellm test"): + response = await litellm.aembedding( + model="text-embedding-ada-002", + input="Hello, world!", + client=client, + ) - # Response is processed by litellm, so just check it exists - assert response is not None - assert len(events) == 1 - (event,) = events + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) - assert event["type"] == "transaction" - spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" - ) - assert len(spans) == 1 - span = spans[0] + # Response is processed by litellm, so just check it exists + assert response is not None + assert len(events) == 1 + (event,) = events + + assert event["type"] == "transaction" + spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_EMBEDDINGS and x["origin"] == "auto.ai.litellm" + ) + assert len(spans) == 1 + span = spans[0] - assert span["op"] == OP.GEN_AI_EMBEDDINGS - # Check that embeddings input is NOT captured when PII is disabled - 
assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert span["op"] == OP.GEN_AI_EMBEDDINGS + # Check that embeddings input is NOT captured when PII is disabled + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_exception_handling( - reset_litellm_executor, sentry_init, capture_events, get_rate_limit_model_response + reset_litellm_executor, + sentry_init, + capture_events, + capture_items, + get_rate_limit_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -851,35 +1301,65 @@ def test_exception_handling( model_response = get_rate_limit_model_response() - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - with pytest.raises(litellm.RateLimitError): - litellm.completion( - model="gpt-3.5-turbo", - messages=messages, - client=client, - ) + if stream_gen_ai_spans: + items = capture_items("event") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"), pytest.raises( + litellm.RateLimitError + ): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) - # Should have error event and transaction - assert len(events) >= 1 - # Find the error event - error_events = [e for e in events if e.get("level") == "error"] + # Find the error event + error_events = [ + item.payload + for item in items + if item.type == "event" and item.payload.get("level") == "error" + ] + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"), pytest.raises( + 
litellm.RateLimitError + ): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + # Should have error event and transaction + assert len(events) >= 1 + # Find the error event + error_events = [e for e in events if e.get("level") == "error"] assert len(error_events) == 1 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_exception_handling( - sentry_init, capture_events, get_rate_limit_model_response + sentry_init, + capture_events, + capture_items, + get_rate_limit_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -887,38 +1367,66 @@ async def test_async_exception_handling( model_response = get_rate_limit_model_response() - with mock.patch.object( - client.embeddings._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): - with pytest.raises(litellm.RateLimitError): - await litellm.acompletion( - model="gpt-3.5-turbo", - messages=messages, - client=client, - ) + if stream_gen_ai_spans: + items = capture_items("event") + + with mock.patch.object( + client.embeddings._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"), pytest.raises( + litellm.RateLimitError + ): + await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) - # Should have error event and transaction - assert len(events) >= 1 - # Find the error event - error_events = [e for e in events if e.get("level") == "error"] + # Find the error event + error_events = [ + item.payload + for item in items + if item.type == "event" and item.payload.get("level") == "error" + ] + else: + events = capture_events() + + with mock.patch.object( + client.embeddings._client._client, 
+ "send", + return_value=model_response, + ), start_transaction(name="litellm test"), pytest.raises( + litellm.RateLimitError + ): + await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + # Should have error event and transaction + assert len(events) >= 1 + # Find the error event + error_events = [e for e in events if e.get("level") == "error"] assert len(error_events) == 1 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_span_origin( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -940,12 +1448,14 @@ def test_span_origin( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): litellm.completion( model="gpt-3.5-turbo", messages=messages, @@ -954,27 +1464,51 @@ def test_span_origin( litellm_utils.executor.shutdown(wait=True) - (event,) = events + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.litellm" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.litellm" + else: + events = capture_events() + with mock.patch.object( + 
client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.litellm" + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_multiple_providers( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, nonstreaming_anthropic_model_response, nonstreaming_google_genai_model_response, + stream_gen_ai_spans, ): """Test that the integration correctly identifies different providers.""" sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -995,12 +1529,14 @@ def test_multiple_providers( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - openai_client.completions._client._client, - "send", - return_value=openai_model_response, - ): - with start_transaction(name="test gpt-3.5-turbo"): + if stream_gen_ai_spans: + items = capture_items("transaction") + + with mock.patch.object( + openai_client.completions._client._client, + "send", + return_value=openai_model_response, + ), start_transaction(name="test gpt-3.5-turbo"): litellm.completion( model="gpt-3.5-turbo", messages=messages, @@ -1009,21 +1545,20 @@ def test_multiple_providers( litellm_utils.executor.shutdown(wait=True) - _reset_litellm_executor() + _reset_litellm_executor() - anthropic_client = HTTPHandler() - anthropic_model_response = get_model_response( - nonstreaming_anthropic_model_response, - serialize_pydantic=True, - request_headers={"X-Stainless-Raw-Response": "true"}, - ) + anthropic_client = HTTPHandler() + 
anthropic_model_response = get_model_response( + nonstreaming_anthropic_model_response, + serialize_pydantic=True, + request_headers={"X-Stainless-Raw-Response": "true"}, + ) - with mock.patch.object( - anthropic_client, - "post", - return_value=anthropic_model_response, - ): - with start_transaction(name="test claude-3-opus-20240229"): + with mock.patch.object( + anthropic_client, + "post", + return_value=anthropic_model_response, + ), start_transaction(name="test claude-3-opus-20240229"): litellm.completion( model="claude-3-opus-20240229", messages=messages, @@ -1033,20 +1568,19 @@ def test_multiple_providers( litellm_utils.executor.shutdown(wait=True) - _reset_litellm_executor() + _reset_litellm_executor() - gemini_client = HTTPHandler() - gemini_model_response = get_model_response( - nonstreaming_google_genai_model_response, - serialize_pydantic=True, - ) + gemini_client = HTTPHandler() + gemini_model_response = get_model_response( + nonstreaming_google_genai_model_response, + serialize_pydantic=True, + ) - with mock.patch.object( - gemini_client, - "post", - return_value=gemini_model_response, - ): - with start_transaction(name="test gemini/gemini-pro"): + with mock.patch.object( + gemini_client, + "post", + return_value=gemini_model_response, + ), start_transaction(name="test gemini/gemini-pro"): litellm.completion( model="gemini/gemini-pro", messages=messages, @@ -1056,29 +1590,100 @@ def test_multiple_providers( litellm_utils.executor.shutdown(wait=True) - assert len(events) == 3 + events = [item.payload for item in items if item.type == "transaction"] + assert len(events) == 3 + + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + # The provider should be detected by litellm.get_llm_provider + assert SPANDATA.GEN_AI_SYSTEM in span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + openai_client.completions._client._client, + "send", + return_value=openai_model_response, + ), 
start_transaction(name="test gpt-3.5-turbo"): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=openai_client, + ) + + litellm_utils.executor.shutdown(wait=True) - for i in range(3): - span = events[i]["spans"][0] - # The provider should be detected by litellm.get_llm_provider - assert SPANDATA.GEN_AI_SYSTEM in span["data"] + _reset_litellm_executor() + anthropic_client = HTTPHandler() + anthropic_model_response = get_model_response( + nonstreaming_anthropic_model_response, + serialize_pydantic=True, + request_headers={"X-Stainless-Raw-Response": "true"}, + ) + with mock.patch.object( + anthropic_client, + "post", + return_value=anthropic_model_response, + ), start_transaction(name="test claude-3-opus-20240229"): + litellm.completion( + model="claude-3-opus-20240229", + messages=messages, + client=anthropic_client, + api_key="test-key", + ) + + litellm_utils.executor.shutdown(wait=True) + + _reset_litellm_executor() + + gemini_client = HTTPHandler() + gemini_model_response = get_model_response( + nonstreaming_google_genai_model_response, + serialize_pydantic=True, + ) + + with mock.patch.object( + gemini_client, + "post", + return_value=gemini_model_response, + ), start_transaction(name="test gemini/gemini-pro"): + litellm.completion( + model="gemini/gemini-pro", + messages=messages, + client=gemini_client, + api_key="test-key", + ) + + litellm_utils.executor.shutdown(wait=True) + + assert len(events) == 3 + + for i in range(3): + span = events[i]["spans"][0] + # The provider should be detected by litellm.get_llm_provider + assert SPANDATA.GEN_AI_SYSTEM in span["data"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_multiple_providers( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, nonstreaming_anthropic_model_response, nonstreaming_google_genai_model_response, + stream_gen_ai_spans, ): 
"""Test that the integration correctly identifies different providers.""" sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -1099,12 +1704,14 @@ async def test_async_multiple_providers( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - openai_client.completions._client._client, - "send", - return_value=openai_model_response, - ): - with start_transaction(name="test gpt-3.5-turbo"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + openai_client.completions._client._client, + "send", + return_value=openai_model_response, + ), start_transaction(name="test gpt-3.5-turbo"): await litellm.acompletion( model="gpt-3.5-turbo", messages=messages, @@ -1114,21 +1721,20 @@ async def test_async_multiple_providers( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - _reset_litellm_executor() + _reset_litellm_executor() - anthropic_client = AsyncHTTPHandler() - anthropic_model_response = get_model_response( - nonstreaming_anthropic_model_response, - serialize_pydantic=True, - request_headers={"X-Stainless-Raw-Response": "True"}, - ) + anthropic_client = AsyncHTTPHandler() + anthropic_model_response = get_model_response( + nonstreaming_anthropic_model_response, + serialize_pydantic=True, + request_headers={"X-Stainless-Raw-Response": "True"}, + ) - with mock.patch.object( - anthropic_client, - "post", - return_value=anthropic_model_response, - ): - with start_transaction(name="test claude-3-opus-20240229"): + with mock.patch.object( + anthropic_client, + "post", + return_value=anthropic_model_response, + ), start_transaction(name="test claude-3-opus-20240229"): await litellm.acompletion( model="claude-3-opus-20240229", messages=messages, @@ -1139,20 +1745,90 @@ async def test_async_multiple_providers( await GLOBAL_LOGGING_WORKER.flush() 
await asyncio.sleep(0.5) - _reset_litellm_executor() + _reset_litellm_executor() - gemini_client = AsyncHTTPHandler() - gemini_model_response = get_model_response( - nonstreaming_google_genai_model_response, - serialize_pydantic=True, - ) + gemini_client = AsyncHTTPHandler() + gemini_model_response = get_model_response( + nonstreaming_google_genai_model_response, + serialize_pydantic=True, + ) - with mock.patch.object( - gemini_client, - "post", - return_value=gemini_model_response, - ): - with start_transaction(name="test gemini/gemini-pro"): + with mock.patch.object( + gemini_client, + "post", + return_value=gemini_model_response, + ), start_transaction(name="test gemini/gemini-pro"): + await litellm.acompletion( + model="gemini/gemini-pro", + messages=messages, + client=gemini_client, + api_key="test-key", + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + events = [item.payload for item in items if item.type == "transaction"] + assert len(events) == 3 + + spans = [item.payload for item in items if item.type == "span"] + for span in spans: + # The provider should be detected by litellm.get_llm_provider + assert SPANDATA.GEN_AI_SYSTEM in span["attributes"] + else: + events = capture_events() + + with mock.patch.object( + openai_client.completions._client._client, + "send", + return_value=openai_model_response, + ), start_transaction(name="test gpt-3.5-turbo"): + await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=openai_client, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + _reset_litellm_executor() + + anthropic_client = AsyncHTTPHandler() + anthropic_model_response = get_model_response( + nonstreaming_anthropic_model_response, + serialize_pydantic=True, + request_headers={"X-Stainless-Raw-Response": "True"}, + ) + + with mock.patch.object( + anthropic_client, + "post", + return_value=anthropic_model_response, + ), start_transaction(name="test claude-3-opus-20240229"): + await 
litellm.acompletion( + model="claude-3-opus-20240229", + messages=messages, + client=anthropic_client, + api_key="test-key", + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + _reset_litellm_executor() + + gemini_client = AsyncHTTPHandler() + gemini_model_response = get_model_response( + nonstreaming_google_genai_model_response, + serialize_pydantic=True, + ) + + with mock.patch.object( + gemini_client, + "post", + return_value=gemini_model_response, + ), start_transaction(name="test gemini/gemini-pro"): await litellm.acompletion( model="gemini/gemini-pro", messages=messages, @@ -1163,27 +1839,30 @@ async def test_async_multiple_providers( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - assert len(events) == 3 + assert len(events) == 3 - for i in range(3): - span = events[i]["spans"][0] - # The provider should be detected by litellm.get_llm_provider - assert SPANDATA.GEN_AI_SYSTEM in span["data"] + for i in range(3): + span = events[i]["spans"][0] + # The provider should be detected by litellm.get_llm_provider + assert SPANDATA.GEN_AI_SYSTEM in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_additional_parameters( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): """Test that additional parameters are captured.""" sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] client = OpenAI(api_key="test-key") @@ -1204,12 +1883,14 @@ def test_additional_parameters( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") 
+ + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): litellm.completion( model="gpt-3.5-turbo", messages=messages, @@ -1223,35 +1904,74 @@ def test_additional_parameters( litellm_utils.executor.shutdown(wait=True) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + temperature=0.7, + max_tokens=100, + top_p=0.9, + frequency_penalty=0.5, + presence_penalty=0.5, + ) + + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert 
span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_additional_parameters( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): """Test that additional parameters are captured.""" sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] client = AsyncOpenAI(api_key="test-key") @@ -1272,12 +1992,14 @@ async def test_async_additional_parameters( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): await litellm.acompletion( model="gpt-3.5-turbo", messages=messages, @@ -1292,34 +2014,74 @@ async def test_async_additional_parameters( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] + spans = [item.payload for item in items if 
item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + else: + events = capture_events() - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + temperature=0.7, + max_tokens=100, + top_p=0.9, + frequency_penalty=0.5, + presence_penalty=0.5, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.5 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.5 + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def 
test_no_integration( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): """Test that when integration is not enabled, callbacks don't break.""" sentry_init( traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] client = OpenAI(api_key="test-key") @@ -1340,12 +2102,14 @@ def test_no_integration( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): litellm.completion( model="gpt-3.5-turbo", messages=messages, @@ -1354,29 +2118,55 @@ def test_no_integration( litellm_utils.executor.shutdown(wait=True) - (event,) = events - # Should still have the transaction, but no child spans since integration is off - assert event["type"] == "transaction" - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + # Should still have the transaction, but no child spans since 
integration is off + assert event["type"] == "transaction" + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) assert len(chat_spans) == 0 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_no_integration( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): """Test that when integration is not enabled, callbacks don't break.""" sentry_init( traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] client = AsyncOpenAI(api_key="test-key") @@ -1397,12 +2187,14 @@ async def test_async_no_integration( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): await litellm.acompletion( model="gpt-3.5-turbo", messages=messages, @@ -1412,24 +2204,54 @@ async def test_async_no_integration( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events - # Should still have the transaction, but no child spans since integration is off - assert event["type"] == "transaction" - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + else: + events = capture_events() + + with 
mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + client=client, + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + (event,) = events + # Should still have the transaction, but no child spans since integration is off + assert event["type"] == "transaction" + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) assert len(chat_spans) == 0 -def test_response_without_usage(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_response_without_usage( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test handling of responses without usage information.""" sentry_init( integrations=[LiteLLMIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [{"role": "user", "content": "Hello!"}] @@ -1443,26 +2265,51 @@ def test_response_without_usage(sentry_init, capture_events): }, )() - with start_transaction(name="litellm test"): - kwargs = { - "model": "gpt-3.5-turbo", - "messages": messages, - } + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - _input_callback(kwargs) - _success_callback( - kwargs, - mock_response, - datetime.now(), - datetime.now(), - ) + with start_transaction(name="litellm test"): + kwargs = { + "model": "gpt-3.5-turbo", + "messages": messages, + } + + _input_callback(kwargs) + _success_callback( + kwargs, + mock_response, + datetime.now(), + datetime.now(), + ) + + (span,) = (item.payload for item in items if item.type == "span") + + # Span should still be created even without usage info + assert span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + assert span["name"] == "chat gpt-3.5-turbo" + else: + events = capture_events() + + 
with start_transaction(name="litellm test"): + kwargs = { + "model": "gpt-3.5-turbo", + "messages": messages, + } + + _input_callback(kwargs) + _success_callback( + kwargs, + mock_response, + datetime.now(), + datetime.now(), + ) - (event,) = events - (span,) = event["spans"] + (event,) = events + (span,) = event["spans"] - # Span should still be created even without usage info - assert span["op"] == OP.GEN_AI_CHAT - assert span["description"] == "chat gpt-3.5-turbo" + # Span should still be created even without usage info + assert span["op"] == OP.GEN_AI_CHAT + assert span["description"] == "chat gpt-3.5-turbo" def test_integration_setup(sentry_init): @@ -1478,14 +2325,20 @@ def test_integration_setup(sentry_init): assert _failure_callback in (litellm.failure_callback or []) -def test_litellm_message_truncation(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_litellm_message_truncation( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that large messages are truncated properly in LiteLLM integration.""" sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() large_content = ( "This is a very long message that will exceed our size limits. 
" * 1000 @@ -1499,39 +2352,78 @@ def test_litellm_message_truncation(sentry_init, capture_events): ] mock_response = MockCompletionResponse() - with start_transaction(name="litellm test"): - kwargs = { - "model": "gpt-3.5-turbo", - "messages": messages, - } + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - _input_callback(kwargs) - _success_callback( - kwargs, - mock_response, - datetime.now(), - datetime.now(), - ) + with start_transaction(name="litellm test"): + kwargs = { + "model": "gpt-3.5-turbo", + "messages": messages, + } + + _input_callback(kwargs) + _success_callback( + kwargs, + mock_response, + datetime.now(), + datetime.now(), + ) + + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_CHAT + ] + assert len(chat_spans) > 0 - assert len(events) > 0 - tx = events[0] - assert tx["type"] == "transaction" + chat_span = chat_spans[0] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["attributes"] - chat_spans = [ - span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT - ] - assert len(chat_spans) > 0 + messages_data = chat_span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) + + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) + tx = next(item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with start_transaction(name="litellm test"): + kwargs = { + "model": "gpt-3.5-turbo", + "messages": messages, + } + + _input_callback(kwargs) + _success_callback( + kwargs, + mock_response, + datetime.now(), + datetime.now(), + ) + + assert len(events) > 0 + tx = events[0] + assert tx["type"] == "transaction" + + chat_spans = [ + span for span in tx.get("spans", []) if span.get("op") == OP.GEN_AI_CHAT + 
] + assert len(chat_spans) > 0 - chat_span = chat_spans[0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] + chat_span = chat_spans[0] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in chat_span["data"] - messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert isinstance(messages_data, str) + messages_data = chat_span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) - parsed_messages = json.loads(messages_data) - assert isinstance(parsed_messages, list) - assert len(parsed_messages) == 1 - assert "small message 5" in str(parsed_messages[0]) + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) == 1 + assert "small message 5" in str(parsed_messages[0]) assert tx["_meta"]["spans"]["0"]["data"]["gen_ai.request.messages"][""]["len"] == 5 @@ -1540,19 +2432,22 @@ def test_litellm_message_truncation(sentry_init, capture_events): IMAGE_DATA_URI = f"data:image/png;base64,{IMAGE_B64}" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_binary_content_encoding_image_url( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1584,12 +2479,14 @@ def test_binary_content_encoding_image_url( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): 
litellm.completion( model="gpt-4-vision-preview", messages=messages, @@ -1599,15 +2496,42 @@ def test_binary_content_encoding_image_url( litellm_utils.executor.shutdown(wait=True) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-4-vision-preview", + messages=messages, + client=client, + custom_llm_provider="openai", + ) + + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) blob_item = next( ( @@ -1628,19 +2552,22 @@ def test_binary_content_encoding_image_url( ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_binary_content_encoding_image_url( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + 
stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1672,12 +2599,14 @@ async def test_async_binary_content_encoding_image_url( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): await litellm.acompletion( model="gpt-4-vision-preview", messages=messages, @@ -1688,15 +2617,43 @@ async def test_async_binary_content_encoding_image_url( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + await litellm.acompletion( + model="gpt-4-vision-preview", + messages=messages, + client=client, + custom_llm_provider="openai", + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert 
len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) blob_item = next( ( @@ -1717,19 +2674,22 @@ async def test_async_binary_content_encoding_image_url( ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_binary_content_encoding_mixed_content( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1762,12 +2722,14 @@ def test_binary_content_encoding_mixed_content( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): litellm.completion( model="gpt-4-vision-preview", messages=messages, @@ -1777,15 +2739,42 @@ def test_binary_content_encoding_mixed_content( litellm_utils.executor.shutdown(wait=True) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = 
json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-4-vision-preview", + messages=messages, + client=client, + custom_llm_provider="openai", + ) + + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content_items = [ item for msg in messages_data if "content" in msg for item in msg["content"] @@ -1794,19 +2783,22 @@ def test_binary_content_encoding_mixed_content( assert any(item.get("type") == "blob" for item in content_items) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_binary_content_encoding_mixed_content( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1839,12 +2831,14 @@ async def test_async_binary_content_encoding_mixed_content( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): await litellm.acompletion( 
model="gpt-4-vision-preview", messages=messages, @@ -1855,15 +2849,43 @@ async def test_async_binary_content_encoding_mixed_content( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + await litellm.acompletion( + model="gpt-4-vision-preview", + messages=messages, + client=client, + custom_llm_provider="openai", + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) content_items = [ item for msg in messages_data if "content" in msg for item in msg["content"] @@ -1872,19 +2894,22 @@ async def test_async_binary_content_encoding_mixed_content( assert any(item.get("type") == "blob" for item in content_items) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_binary_content_encoding_uri_type( reset_litellm_executor, sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + 
stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1915,12 +2940,13 @@ def test_binary_content_encoding_uri_type( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): litellm.completion( model="gpt-4-vision-preview", messages=messages, @@ -1930,15 +2956,42 @@ def test_binary_content_encoding_uri_type( litellm_utils.executor.shutdown(wait=True) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + litellm.completion( + model="gpt-4-vision-preview", + messages=messages, + client=client, + custom_llm_provider="openai", + ) + + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == 
OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) uri_item = next( ( @@ -1954,19 +3007,22 @@ def test_binary_content_encoding_uri_type( assert uri_item["uri"] == "https://example.com/image.jpg" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio(loop_scope="session") async def test_async_binary_content_encoding_uri_type( sentry_init, capture_events, + capture_items, get_model_response, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[LiteLLMIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() messages = [ { @@ -1997,12 +3053,14 @@ async def test_async_binary_content_encoding_uri_type( request_headers={"X-Stainless-Raw-Response": "true"}, ) - with mock.patch.object( - client.completions._client._client, - "send", - return_value=model_response, - ): - with start_transaction(name="litellm test"): + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): await litellm.acompletion( model="gpt-4-vision-preview", messages=messages, @@ -2013,15 +3071,43 @@ async def test_async_binary_content_encoding_uri_type( await GLOBAL_LOGGING_WORKER.flush() await asyncio.sleep(0.5) - (event,) = events - chat_spans = list( - x - for x in event["spans"] - if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" - ) - assert len(chat_spans) == 1 - span = chat_spans[0] - messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + spans = [item.payload for item in items if item.type == "span"] + chat_spans = list( + x + for x in spans + if x["attributes"]["sentry.op"] == 
OP.GEN_AI_CHAT + and x["attributes"]["sentry.origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + else: + events = capture_events() + + with mock.patch.object( + client.completions._client._client, + "send", + return_value=model_response, + ), start_transaction(name="litellm test"): + await litellm.acompletion( + model="gpt-4-vision-preview", + messages=messages, + client=client, + custom_llm_provider="openai", + ) + + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + (event,) = events + chat_spans = list( + x + for x in event["spans"] + if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm" + ) + assert len(chat_spans) == 1 + span = chat_spans[0] + messages_data = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) uri_item = next( ( diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index f758d0138f..934a0b8f4e 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -103,6 +103,7 @@ async def __call__(self, *args, **kwargs): ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -114,16 +115,18 @@ async def __call__(self, *args, **kwargs): def test_nonstreaming_chat_completion_no_prompts( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -140,53 +143,98 @@ def test_nonstreaming_chat_completion_no_prompts( ) ) - with start_transaction(name="openai tx"): - 
response = ( - client.chat.completions.create( - model="some-model", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "hello"}, - ], - max_tokens=100, - presence_penalty=0.1, - frequency_penalty=0.2, - temperature=0.7, - top_p=0.9, + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = ( + client.chat.completions.create( + model="some-model", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "hello"}, + ], + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + .choices[0] + .message.content ) - .choices[0] - .message.content - ) - assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + assert response == "the model response" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] 
+ + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + with start_transaction(name="openai tx"): + response = ( + client.chat.completions.create( + model="some-model", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "hello"}, + ], + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + .choices[0] + .message.content + ) - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert response == "the model response" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in 
span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( - "messages", + "get_messages", [ pytest.param( - [ + lambda: [ { "role": "system", "content": "You are a helpful assistant.", @@ -196,7 +244,7 @@ def test_nonstreaming_chat_completion_no_prompts( id="blocks", ), pytest.param( - [ + lambda: [ { "role": "system", "content": [ @@ -209,7 +257,7 @@ def test_nonstreaming_chat_completion_no_prompts( id="parts", ), pytest.param( - iter( + lambda: iter( [ { "role": "system", @@ -228,16 +276,18 @@ def test_nonstreaming_chat_completion_no_prompts( def test_nonstreaming_chat_completion( sentry_init, capture_events, - messages, + capture_items, + get_messages, request, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -254,64 +304,129 @@ def test_nonstreaming_chat_completion( ) ) - with start_transaction(name="openai tx"): - response = ( - client.chat.completions.create( - model="some-model", - messages=messages, - max_tokens=100, - presence_penalty=0.1, - frequency_penalty=0.2, - temperature=0.7, - top_p=0.9, + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = ( + client.chat.completions.create( + model="some-model", + messages=get_messages(), + max_tokens=100, + presence_penalty=0.1, 
+ frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + .choices[0] + .message.content ) - .choices[0] - .message.content - ) - assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - param_id = request.node.callspec.id - if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - } - ] + assert response == "the model response" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful 
assistant.", + } + ] + else: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] + + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "the model response" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - }, - { - "type": "text", - "content": "Be concise and clear.", - }, - ] + events = capture_events() + + with start_transaction(name="openai tx"): + response = ( + client.chat.completions.create( + model="some-model", + messages=get_messages(), + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + .choices[0] + .message.content + ) + + assert response == "the model response" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + 
"content": "You are a helpful assistant.", + } + ] + else: + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -324,16 +439,18 @@ def test_nonstreaming_chat_completion( async def test_nonstreaming_chat_completion_async_no_prompts( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") client.chat.completions._post = mock.AsyncMock( @@ -350,51 +467,93 @@ async def test_nonstreaming_chat_completion_async_no_prompts( ) ) - with start_transaction(name="openai tx"): - response = await client.chat.completions.create( - model="some-model", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "hello"}, - ], - max_tokens=100, - presence_penalty=0.1, - 
frequency_penalty=0.2, - temperature=0.7, - top_p=0.9, - ) - response = response.choices[0].message.content - - assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + if stream_gen_ai_spans: + items = capture_items("span") - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + with start_transaction(name="openai tx"): + response = await client.chat.completions.create( + model="some-model", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "hello"}, + ], + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + response = response.choices[0].message.content + + assert response == "the model response" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + 
assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + with start_transaction(name="openai tx"): + response = await client.chat.completions.create( + model="some-model", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "hello"}, + ], + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + response = response.choices[0].message.content + + assert response == "the model response" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 
- assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( - "messages", + "get_messages", [ pytest.param( - [ + lambda: [ { "role": "system", "content": "You are a helpful assistant.", @@ -404,7 +563,7 @@ async def test_nonstreaming_chat_completion_async_no_prompts( id="blocks", ), pytest.param( - [ + lambda: [ { "role": "system", "content": [ @@ -417,7 +576,7 @@ async def test_nonstreaming_chat_completion_async_no_prompts( id="parts", ), pytest.param( - iter( + lambda: iter( [ { "role": "system", @@ -436,16 +595,18 @@ async def test_nonstreaming_chat_completion_async_no_prompts( async def test_nonstreaming_chat_completion_async( sentry_init, capture_events, - messages, + capture_items, + get_messages, request, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") client.chat.completions._post = AsyncMock( @@ -462,59 +623,120 @@ async def test_nonstreaming_chat_completion_async( ) ) - with start_transaction(name="openai tx"): - response = await client.chat.completions.create( - model="some-model", - messages=messages, - max_tokens=100, - presence_penalty=0.1, - frequency_penalty=0.2, - temperature=0.7, - top_p=0.9, - ) - response = response.choices[0].message.content - - assert response == "the model response" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False - - 
assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - param_id = request.node.callspec.id - if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - } - ] + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = await client.chat.completions.create( + model="some-model", + messages=get_messages(), + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + response = response.choices[0].message.content + + assert response == "the model response" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + } + ] + else: + assert json.loads( + 
span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] + + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "the model response" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - }, - { - "type": "text", - "content": "Be concise and clear.", - }, - ] + events = capture_events() + + with start_transaction(name="openai tx"): + response = await client.chat.completions.create( + model="some-model", + messages=get_messages(), + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + response = response.choices[0].message.content + + assert response == "the model response" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is False + + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful 
assistant.", + } + ] + else: + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 def tiktoken_encoding_if_installed(): @@ -527,6 +749,7 @@ def tiktoken_encoding_if_installed(): # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -538,10 +761,12 @@ def tiktoken_encoding_if_installed(): def test_streaming_chat_completion_no_prompts( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -552,8 +777,8 @@ def test_streaming_chat_completion_no_prompts( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -603,12 +828,14 @@ def test_streaming_chat_completion_no_prompts( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + 
client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.chat.completions.create( model="some-model", messages=[ @@ -626,37 +853,90 @@ def test_streaming_chat_completion_no_prompts( map(lambda x: x.choices[0].delta.content, response_stream) ) - assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert response_string == "hello world" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import + + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will 
be calculated properly + else: + events = capture_events() - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "hello"}, + ], + stream=True, + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + response_string = "".join( + map(lambda x: x.choices[0].delta.content, response_stream) + ) - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + assert response_string == "hello world" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - try: - import tiktoken # type: ignore # noqa # 
pylint: disable=unused-import + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 - except ImportError: - pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import + + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 7 + assert span["data"]["gen_ai.usage.total_tokens"] == 9 + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( OPENAI_VERSION <= (1, 1, 0), reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", @@ -664,16 +944,18 @@ def test_streaming_chat_completion_no_prompts( def test_streaming_chat_completion_with_usage_in_stream( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """When stream_options=include_usage is set, token usage comes from the final chunk's usage field.""" sentry_init( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -715,12 +997,36 @@ def test_streaming_chat_completion_with_usage_in_stream( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: 
+ items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.chat.completions.create( model="some-model", messages=[{"role": "user", "content": "hello"}], @@ -730,15 +1036,16 @@ def test_streaming_chat_completion_with_usage_in_stream( for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( OPENAI_VERSION <= (1, 1, 0), reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", @@ -746,16 +1053,18 @@ def test_streaming_chat_completion_with_usage_in_stream( def 
test_streaming_chat_completion_empty_content_preserves_token_usage( sentry_init, capture_events, + capture_items, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): """Token usage from the stream is recorded even when no content is produced (e.g. content filter).""" sentry_init( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -778,12 +1087,36 @@ def test_streaming_chat_completion_empty_content_preserves_token_usage( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["attributes"] + assert span["attributes"]["gen_ai.usage.total_tokens"] == 20 + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.chat.completions.create( model="some-model", messages=[{"role": "user", "content": "hello"}], @@ -793,15 +1126,16 @@ def test_streaming_chat_completion_empty_content_preserves_token_usage( for _ in response_stream: pass - tx = events[0] - assert tx["type"] == 
"transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert "gen_ai.usage.output_tokens" not in span["data"] - assert span["data"]["gen_ai.usage.total_tokens"] == 20 + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["data"] + assert span["data"]["gen_ai.usage.total_tokens"] == 20 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( OPENAI_VERSION <= (1, 1, 0), reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", @@ -810,17 +1144,19 @@ def test_streaming_chat_completion_empty_content_preserves_token_usage( async def test_streaming_chat_completion_empty_content_preserves_token_usage_async( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): """Token usage from the stream is recorded even when no content is produced - async variant.""" sentry_init( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -845,12 +1181,36 @@ async def test_streaming_chat_completion_empty_content_preserves_token_usage_asy ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + 
stream=True, + stream_options={"include_usage": True}, + ) + async for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["attributes"] + assert span["attributes"]["gen_ai.usage.total_tokens"] == 20 + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = await client.chat.completions.create( model="some-model", messages=[{"role": "user", "content": "hello"}], @@ -860,15 +1220,16 @@ async def test_streaming_chat_completion_empty_content_preserves_token_usage_asy async for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert "gen_ai.usage.output_tokens" not in span["data"] - assert span["data"]["gen_ai.usage.total_tokens"] == 20 + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["data"] + assert span["data"]["gen_ai.usage.total_tokens"] == 20 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( OPENAI_VERSION <= (1, 1, 0), reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", @@ -877,17 +1238,19 @@ async def test_streaming_chat_completion_empty_content_preserves_token_usage_asy async def test_streaming_chat_completion_async_with_usage_in_stream( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): """When stream_options=include_usage is set, token usage 
comes from the final chunk's usage field (async).""" sentry_init( integrations=[OpenAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -931,12 +1294,36 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + async for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = await client.chat.completions.create( model="some-model", messages=[{"role": "user", "content": "hello"}], @@ -946,21 +1333,22 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( async for _ in response_stream: pass - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert 
span["data"]["gen_ai.usage.total_tokens"] == 30 + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( - "messages", + "get_messages", [ pytest.param( - [ + lambda: [ { "role": "system", "content": "You are a helpful assistant.", @@ -970,7 +1358,7 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( id="blocks", ), pytest.param( - [ + lambda: [ { "role": "system", "content": [ @@ -983,7 +1371,7 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( id="parts", ), pytest.param( - iter( + lambda: iter( [ { "role": "system", @@ -1002,10 +1390,12 @@ async def test_streaming_chat_completion_async_with_usage_in_stream( def test_streaming_chat_completion( sentry_init, capture_events, - messages, + capture_items, + get_messages, request, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1016,8 +1406,8 @@ def test_streaming_chat_completion( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -1067,15 +1457,17 @@ def test_streaming_chat_completion( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.chat.completions.create( model="some-model", - messages=messages, + 
messages=get_messages(), stream=True, max_tokens=100, presence_penalty=0.1, @@ -1086,79 +1478,160 @@ def test_streaming_chat_completion( response_string = "".join( map(lambda x: x.choices[0].delta.content, response_stream) ) - assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - param_id = request.node.callspec.id - if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - } - ] - else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - }, - { - "type": "text", - "content": "Be concise and clear.", - }, - ] + assert response_string == "hello world" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert 
span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + } + ] + else: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "hello world" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "hello world" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] - try: - import tiktoken # type: ignore # noqa # pylint: disable=unused-import + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import - if "blocks" in param_id: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 - else: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 12 - assert span["data"]["gen_ai.usage.total_tokens"] == 14 - except ImportError: - pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + if "blocks" in param_id: + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 + else: + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert 
span["attributes"]["gen_ai.usage.input_tokens"] == 12 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 14 + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + else: + events = capture_events() -# noinspection PyTypeChecker -@pytest.mark.asyncio -@pytest.mark.parametrize( - "send_default_pii, include_prompts", - [ - (True, False), - (False, True), - (False, False), - ], + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=get_messages(), + stream=True, + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + response_string = "".join( + map(lambda x: x.choices[0].delta.content, response_stream) + ) + assert response_string == "hello world" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + } + ] + else: + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + 
"content": "Be concise and clear.", + }, + ] + + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "hello world" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import + + if "blocks" in param_id: + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 7 + assert span["data"]["gen_ai.usage.total_tokens"] == 9 + else: + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 12 + assert span["data"]["gen_ai.usage.total_tokens"] == 14 + + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + + +# noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +@pytest.mark.asyncio +@pytest.mark.parametrize( + "send_default_pii, include_prompts", + [ + (True, False), + (False, True), + (False, False), + ], ) async def test_streaming_chat_completion_async_no_prompts( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1169,8 +1642,8 @@ async def test_streaming_chat_completion_async_no_prompts( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -1222,12 +1695,14 @@ async def test_streaming_chat_completion_async_no_prompts( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + 
return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = await client.chat.completions.create( model="some-model", messages=[ @@ -1246,45 +1721,100 @@ async def test_streaming_chat_completion_async_no_prompts( async for x in response_stream: response_string += x.choices[0].delta.content - assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + assert response_string == "hello world" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES 
not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["attributes"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - try: - import tiktoken # type: ignore # noqa # pylint: disable=unused-import + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 - except ImportError: - pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "hello"}, + ], + stream=True, + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + + response_string = "" + async for x in response_stream: + response_string += x.choices[0].delta.content + + assert response_string == "hello world" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + 
assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in span["data"] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import + + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 7 + assert span["data"]["gen_ai.usage.total_tokens"] == 9 + + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( - "messages", + "get_messages", [ pytest.param( - [ + lambda: [ { "role": "system", "content": "You are a helpful assistant.", @@ -1294,7 +1824,7 @@ async def test_streaming_chat_completion_async_no_prompts( id="blocks", ), pytest.param( - [ + lambda: [ { "role": "system", "content": [ @@ -1307,7 +1837,7 @@ async def test_streaming_chat_completion_async_no_prompts( id="parts", ), pytest.param( - iter( + lambda: iter( [ { "role": "system", @@ -1326,11 +1856,13 @@ async def test_streaming_chat_completion_async_no_prompts( async def test_streaming_chat_completion_async( sentry_init, capture_events, - messages, + capture_items, + get_messages, request, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -1341,8 +1873,8 @@ async def test_streaming_chat_completion_async( ], traces_sample_rate=1.0, send_default_pii=True, + 
stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") @@ -1395,15 +1927,17 @@ async def test_streaming_chat_completion_async( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = await client.chat.completions.create( model="some-model", - messages=messages, + messages=get_messages(), stream=True, max_tokens=100, presence_penalty=0.1, @@ -1416,118 +1950,279 @@ async def test_streaming_chat_completion_async( async for x in response_stream: response_string += x.choices[0].delta.content - assert response_string == "hello world" - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 - assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" - - param_id = request.node.callspec.id - if "blocks" in param_id: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - "content": "You are a helpful assistant.", - } - ] + assert response_string == "hello world" + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" 
+ assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True + + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + + param_id = request.node.callspec.id + if "blocks" in param_id: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + } + ] + else: + assert json.loads( + span["attributes"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + ) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] + + assert "hello" in span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "hello world" in span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import + + if "blocks" in param_id: + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 7 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 9 + else: + assert span["attributes"]["gen_ai.usage.output_tokens"] == 2 + assert span["attributes"]["gen_ai.usage.input_tokens"] == 12 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 14 + + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly else: - assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ - { - "type": "text", - 
"content": "You are a helpful assistant.", - }, - { - "type": "text", - "content": "Be concise and clear.", - }, - ] + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=get_messages(), + stream=True, + max_tokens=100, + presence_penalty=0.1, + frequency_penalty=0.2, + temperature=0.7, + top_p=0.9, + ) + + response_string = "" + async for x in response_stream: + response_string += x.choices[0].delta.content - assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert "hello world" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert response_string == "hello world" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True - try: - import tiktoken # type: ignore # noqa # pylint: disable=unused-import + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "some-model" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_PRESENCE_PENALTY] == 0.1 + assert span["data"][SPANDATA.GEN_AI_REQUEST_FREQUENCY_PENALTY] == 0.2 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "model-id" + param_id = request.node.callspec.id if "blocks" in param_id: - assert span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 7 - assert span["data"]["gen_ai.usage.total_tokens"] == 9 + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + } + ] else: - assert 
span["data"]["gen_ai.usage.output_tokens"] == 2 - assert span["data"]["gen_ai.usage.input_tokens"] == 12 - assert span["data"]["gen_ai.usage.total_tokens"] == 14 + assert json.loads(span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS]) == [ + { + "type": "text", + "content": "You are a helpful assistant.", + }, + { + "type": "text", + "content": "Be concise and clear.", + }, + ] - except ImportError: - pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + assert "hello" in span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert "hello world" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + try: + import tiktoken # type: ignore # noqa # pylint: disable=unused-import -def test_bad_chat_completion(sentry_init, capture_events): - sentry_init(integrations=[OpenAIIntegration()], traces_sample_rate=1.0) - events = capture_events() + if "blocks" in param_id: + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 7 + assert span["data"]["gen_ai.usage.total_tokens"] == 9 + else: + assert span["data"]["gen_ai.usage.output_tokens"] == 2 + assert span["data"]["gen_ai.usage.input_tokens"] == 12 + assert span["data"]["gen_ai.usage.total_tokens"] == 14 - client = OpenAI(api_key="z") - client.chat.completions._post = mock.Mock( - side_effect=OpenAIError("API rate limit reached") + except ImportError: + pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_bad_chat_completion( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - with pytest.raises(OpenAIError): - client.chat.completions.create( - model="some-model", - messages=[{"role": "system", "content": "hello"}], - ) - (event,) = events - assert event["level"] == 
"error" + if stream_gen_ai_spans: + items = capture_items("event") + client = OpenAI(api_key="z") + client.chat.completions._post = mock.Mock( + side_effect=OpenAIError("API rate limit reached") + ) + with pytest.raises(OpenAIError): + client.chat.completions.create( + model="some-model", + messages=[{"role": "system", "content": "hello"}], + ) -def test_span_status_error(sentry_init, capture_events): - sentry_init(integrations=[OpenAIIntegration()], traces_sample_rate=1.0) - events = capture_events() + (event,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() - with start_transaction(name="test"): client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( side_effect=OpenAIError("API rate limit reached") ) with pytest.raises(OpenAIError): client.chat.completions.create( - model="some-model", messages=[{"role": "system", "content": "hello"}] + model="some-model", + messages=[{"role": "system", "content": "hello"}], + ) + + (event,) = events + + assert event["level"] == "error" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_status_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") + + with start_transaction(name="test"): + client = OpenAI(api_key="z") + client.chat.completions._post = mock.Mock( + side_effect=OpenAIError("API rate limit reached") + ) + with pytest.raises(OpenAIError): + client.chat.completions.create( + model="some-model", + messages=[{"role": "system", "content": "hello"}], + ) + + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + else: + events = 
capture_events() + + with start_transaction(name="test"): + client = OpenAI(api_key="z") + client.chat.completions._post = mock.Mock( + side_effect=OpenAIError("API rate limit reached") ) + with pytest.raises(OpenAIError): + client.chat.completions.create( + model="some-model", + messages=[{"role": "system", "content": "hello"}], + ) - (error, transaction) = events - assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == "internal_error" + (error, transaction) = events + assert error["level"] == "error" + assert transaction["spans"][0]["status"] == "internal_error" + assert transaction["spans"][0]["tags"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_bad_chat_completion_async(sentry_init, capture_events): - sentry_init(integrations=[OpenAIIntegration()], traces_sample_rate=1.0) - events = capture_events() +async def test_bad_chat_completion_async( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) client = AsyncOpenAI(api_key="z") client.chat.completions._post = AsyncMock( side_effect=OpenAIError("API rate limit reached") ) - with pytest.raises(OpenAIError): - await client.chat.completions.create( - model="some-model", messages=[{"role": "system", "content": "hello"}] - ) + if stream_gen_ai_spans: + items = capture_items("event") + + with pytest.raises(OpenAIError): + await client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + + (event,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() + + with pytest.raises(OpenAIError): + await client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + + 
(event,) = events - (event,) = events assert event["level"] == "error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [ @@ -1537,14 +2232,19 @@ async def test_bad_chat_completion_async(sentry_init, capture_events): ], ) def test_embeddings_create_no_pii( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") @@ -1559,60 +2259,87 @@ def test_embeddings_create_no_pii( ) client.embeddings._post = mock.Mock(return_value=returned_embedding) - with start_transaction(name="openai tx"): - response = client.embeddings.create( - input="hello", model="text-embedding-3-large" + + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-3-large" ) - assert len(response.data[0].embedding) == 3 + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] + + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() + + with start_transaction(name="openai tx"): + response = client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) - tx = events[0] - assert 
tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + assert len(response.data[0].embedding) == 3 - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.embeddings" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( - "input", + "get_input", [ pytest.param( - "hello", + lambda: "hello", id="string", ), pytest.param( - ["First text", "Second text", "Third text"], + lambda: ["First text", "Second text", "Third text"], id="string_sequence", ), pytest.param( - iter(["First text", "Second text", "Third text"]), + lambda: iter(["First text", "Second text", "Third text"]), id="string_iterable", ), pytest.param( - [5, 8, 13, 21, 34], + lambda: [5, 8, 13, 21, 34], id="tokens", ), pytest.param( - iter( + lambda: iter( [5, 8, 13, 21, 34], ), id="token_iterable", ), pytest.param( - [ + lambda: [ [5, 8, 13, 21, 34], [8, 13, 21, 34, 55], ], id="tokens_sequence", ), pytest.param( - iter( + lambda: iter( [ [5, 8, 13, 21, 34], [8, 13, 21, 34, 55], @@ -1622,13 +2349,20 @@ def test_embeddings_create_no_pii( ), ], ) -def test_embeddings_create(sentry_init, capture_events, input, request): +def test_embeddings_create( + sentry_init, + capture_events, + capture_items, + get_input, + request, + stream_gen_ai_spans, +): sentry_init( 
integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") @@ -1643,45 +2377,111 @@ def test_embeddings_create(sentry_init, capture_events, input, request): ) client.embeddings._post = mock.Mock(return_value=returned_embedding) - with start_transaction(name="openai tx"): - response = client.embeddings.create(input=input, model="text-embedding-3-large") - - assert len(response.data[0].embedding) == 3 - - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" - - param_id = request.node.callspec.id - if param_id == "string": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == ["hello"] - elif param_id == "string_sequence" or param_id == "string_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ - "First text", - "Second text", - "Third text", - ] - elif param_id == "tokens" or param_id == "token_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ - 5, - 8, - 13, - 21, - 34, - ] + + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = client.embeddings.create( + input=get_input(), model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-3-large" + ) + + param_id = request.node.callspec.id + if ( + "string" in param_id + and "string_sequence" not in param_id + and 
"string_iterable" not in param_id + ): + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "hello" + ] + elif "string_sequence" in param_id or "string_iterable" in param_id: + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "First text", + "Second text", + "Third text", + ] + elif ( + "tokens" in param_id or "token_iterable" in param_id + ) and "tokens_sequence" not in param_id: + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + 5, + 8, + 13, + 21, + 34, + ] + else: + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + [5, 8, 13, 21, 34], + [8, 13, 21, 34, 55], + ] + + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 else: - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ - [5, 8, 13, 21, 34], - [8, 13, 21, 34, 55], - ] + events = capture_events() + + with start_transaction(name="openai tx"): + response = client.embeddings.create( + input=get_input(), model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.embeddings" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + + param_id = request.node.callspec.id + if ( + "string" in param_id + and "string_sequence" not in param_id + and "string_iterable" not in param_id + ): + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "hello" + ] + elif "string_sequence" in param_id or "string_iterable" in param_id: + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "First text", + "Second text", + "Third text", + ] + elif ( + "tokens" in param_id or "token_iterable" in param_id + ) and "tokens_sequence" not in param_id: + assert 
json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + 5, + 8, + 13, + 21, + 34, + ] + else: + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + [5, 8, 13, 21, 34], + [8, 13, 21, 34, 55], + ] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -1692,14 +2492,19 @@ def test_embeddings_create(sentry_init, capture_events, input, request): ], ) async def test_embeddings_create_async_no_pii( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") @@ -1714,61 +2519,88 @@ async def test_embeddings_create_async_no_pii( ) client.embeddings._post = AsyncMock(return_value=returned_embedding) - with start_transaction(name="openai tx"): - response = await client.embeddings.create( - input="hello", model="text-embedding-3-large" + + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = await client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-3-large" 
) - assert len(response.data[0].embedding) == 3 + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["attributes"] + + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 + else: + events = capture_events() + + with start_transaction(name="openai tx"): + response = await client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.embeddings" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" - assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] + assert SPANDATA.GEN_AI_EMBEDDINGS_INPUT not in span["data"] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( - "input", + "get_input", [ pytest.param( - "hello", + lambda: "hello", id="string", ), pytest.param( - ["First text", "Second text", "Third text"], + lambda: ["First text", "Second text", "Third text"], id="string_sequence", ), pytest.param( - iter(["First text", "Second text", "Third text"]), + lambda: iter(["First text", "Second text", "Third text"]), id="string_iterable", ), pytest.param( - [5, 8, 13, 21, 34], + lambda: [5, 8, 13, 21, 34], id="tokens", ), pytest.param( - iter( + lambda: iter( [5, 8, 13, 21, 34], ), id="token_iterable", 
), pytest.param( - [ + lambda: [ [5, 8, 13, 21, 34], [8, 13, 21, 34, 55], ], id="tokens_sequence", ), pytest.param( - iter( + lambda: iter( [ [5, 8, 13, 21, 34], [8, 13, 21, 34, 55], @@ -1778,13 +2610,20 @@ async def test_embeddings_create_async_no_pii( ), ], ) -async def test_embeddings_create_async(sentry_init, capture_events, input, request): +async def test_embeddings_create_async( + sentry_init, + capture_events, + capture_items, + get_input, + request, + stream_gen_ai_spans, +): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") @@ -1799,60 +2638,129 @@ async def test_embeddings_create_async(sentry_init, capture_events, input, reque ) client.embeddings._post = AsyncMock(return_value=returned_embedding) - with start_transaction(name="openai tx"): - response = await client.embeddings.create( - input=input, model="text-embedding-3-large" - ) - assert len(response.data[0].embedding) == 3 - - tx = events[0] - assert tx["type"] == "transaction" - span = tx["spans"][0] - assert span["op"] == "gen_ai.embeddings" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" - - param_id = request.node.callspec.id - if param_id == "string": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == ["hello"] - elif param_id == "string_sequence" or param_id == "string_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ - "First text", - "Second text", - "Third text", - ] - elif param_id == "tokens" or param_id == "token_iterable": - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ - 5, - 8, - 13, - 21, - 34, - ] + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + response = await client.embeddings.create( + 
input=get_input(), model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.embeddings" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert ( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MODEL] + == "text-embedding-3-large" + ) + + param_id = request.node.callspec.id + if ( + "string" in param_id + and "string_sequence" not in param_id + and "string_iterable" not in param_id + ): + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "hello" + ] + elif "string_sequence" in param_id or "string_iterable" in param_id: + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "First text", + "Second text", + "Third text", + ] + elif ( + "tokens" in param_id or "token_iterable" in param_id + ) and "tokens_sequence" not in param_id: + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + 5, + 8, + 13, + 21, + 34, + ] + else: + assert json.loads(span["attributes"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + [5, 8, 13, 21, 34], + [8, 13, 21, 34, 55], + ] + + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 else: - assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ - [5, 8, 13, 21, 34], - [8, 13, 21, 34, 55], - ] + events = capture_events() + + with start_transaction(name="openai tx"): + response = await client.embeddings.create( + input=get_input(), model="text-embedding-3-large" + ) + + assert len(response.data[0].embedding) == 3 + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.embeddings" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "text-embedding-3-large" + + param_id = request.node.callspec.id + if ( + 
"string" in param_id + and "string_sequence" not in param_id + and "string_iterable" not in param_id + ): + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "hello" + ] + elif "string_sequence" in param_id or "string_iterable" in param_id: + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + "First text", + "Second text", + "Third text", + ] + elif ( + "tokens" in param_id or "token_iterable" in param_id + ) and "tokens_sequence" not in param_id: + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + 5, + 8, + 13, + 21, + 34, + ] + else: + assert json.loads(span["data"][SPANDATA.GEN_AI_EMBEDDINGS_INPUT]) == [ + [5, 8, 13, 21, 34], + [8, 13, 21, 34, 55], + ] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [(True, True), (True, False), (False, True), (False, False)], ) def test_embeddings_create_raises_error( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") @@ -1860,27 +2768,44 @@ def test_embeddings_create_raises_error( side_effect=OpenAIError("API rate limit reached") ) - with pytest.raises(OpenAIError): - client.embeddings.create(input="hello", model="text-embedding-3-large") + if stream_gen_ai_spans: + items = capture_items("event") + + with pytest.raises(OpenAIError): + client.embeddings.create(input="hello", 
model="text-embedding-3-large") + + (event,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() + + with pytest.raises(OpenAIError): + client.embeddings.create(input="hello", model="text-embedding-3-large") + + (event,) = events - (event,) = events assert event["level"] == "error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", [(True, True), (True, False), (False, True), (False, False)], ) async def test_embeddings_create_raises_error_async( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") @@ -1888,21 +2813,41 @@ async def test_embeddings_create_raises_error_async( side_effect=OpenAIError("API rate limit reached") ) - with pytest.raises(OpenAIError): - await client.embeddings.create(input="hello", model="text-embedding-3-large") + if stream_gen_ai_spans: + items = capture_items("event") + + with pytest.raises(OpenAIError): + await client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + (event,) = (item.payload for item in items if item.type == "event") + else: + events = capture_events() + + with pytest.raises(OpenAIError): + await client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + (event,) = events - (event,) = events assert event["level"] == "error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_span_origin_nonstreaming_chat( - sentry_init, capture_events, nonstreaming_chat_completions_model_response + sentry_init, + capture_events, + capture_items, + 
nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -1919,26 +2864,47 @@ def test_span_origin_nonstreaming_chat( ) ) - with start_transaction(name="openai tx"): - client.chat.completions.create( - model="some-model", messages=[{"role": "system", "content": "hello"}] - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="openai tx"): + client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) - (event,) = events + (event,) = events - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.openai" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_span_origin_nonstreaming_chat_async( - sentry_init, capture_events, nonstreaming_chat_completions_model_response + sentry_init, + capture_events, + capture_items, + nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") 
client.chat.completions._post = AsyncMock( @@ -1955,23 +2921,45 @@ async def test_span_origin_nonstreaming_chat_async( ) ) - with start_transaction(name="openai tx"): - await client.chat.completions.create( - model="some-model", messages=[{"role": "system", "content": "hello"}] - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="openai tx"): + await client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" + else: + events = capture_events() + + with start_transaction(name="openai tx"): + await client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) - (event,) = events + (event,) = events - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.openai" -def test_span_origin_streaming_chat(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_origin_streaming_chat( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = Stream(cast_to=None, response=None, client=client) @@ -2011,29 +2999,53 @@ def test_span_origin_streaming_chat(sentry_init, capture_events): ), ] - client.chat.completions._post = mock.Mock(return_value=returned_stream) - with start_transaction(name="openai tx"): - response_stream = 
client.chat.completions.create( - model="some-model", messages=[{"role": "system", "content": "hello"}] - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + client.chat.completions._post = mock.Mock(return_value=returned_stream) + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + + "".join(map(lambda x: x.choices[0].delta.content, response_stream)) + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" - "".join(map(lambda x: x.choices[0].delta.content, response_stream)) + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" + else: + events = capture_events() + + client.chat.completions._post = mock.Mock(return_value=returned_stream) + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) - (event,) = events + "".join(map(lambda x: x.choices[0].delta.content, response_stream)) - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + (event,) = events + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.openai" + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_span_origin_streaming_chat_async( - sentry_init, capture_events, async_iterator + sentry_init, + capture_events, + capture_items, + async_iterator, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = AsyncStream(cast_to=None, response=None, client=client) @@ 
-2078,27 +3090,54 @@ async def test_span_origin_streaming_chat_async( ) client.chat.completions._post = AsyncMock(return_value=returned_stream) - with start_transaction(name="openai tx"): - response_stream = await client.chat.completions.create( - model="some-model", messages=[{"role": "system", "content": "hello"}] - ) - async for _ in response_stream: - pass - # "".join(map(lambda x: x.choices[0].delta.content, response_stream)) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + async for _ in response_stream: + pass + + # "".join(map(lambda x: x.choices[0].delta.content, response_stream)) + + (event,) = (item.payload for item in items if item.type == "transaction") + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" + else: + events = capture_events() + + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", messages=[{"role": "system", "content": "hello"}] + ) + async for _ in response_stream: + pass + + # "".join(map(lambda x: x.choices[0].delta.content, response_stream)) - (event,) = events + (event,) = events - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.openai" -def test_span_origin_embeddings(sentry_init, capture_events): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_origin_embeddings( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + 
stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") @@ -2113,22 +3152,43 @@ def test_span_origin_embeddings(sentry_init, capture_events): ) client.embeddings._post = mock.Mock(return_value=returned_embedding) - with start_transaction(name="openai tx"): - client.embeddings.create(input="hello", model="text-embedding-3-large") - (event,) = events + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="openai tx"): + client.embeddings.create(input="hello", model="text-embedding-3-large") + + (event,) = [item.payload for item in items if item.type == "transaction"] + assert event["contexts"]["trace"]["origin"] == "manual" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.embeddings.create(input="hello", model="text-embedding-3-large") + + (event,) = events - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.openai" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_span_origin_embeddings_async(sentry_init, capture_events): +async def test_span_origin_embeddings_async( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") @@ -2143,13 +3203,32 @@ async def test_span_origin_embeddings_async(sentry_init, capture_events): ) client.embeddings._post = AsyncMock(return_value=returned_embedding) - with start_transaction(name="openai tx"): - await client.embeddings.create(input="hello", 
model="text-embedding-3-large") - (event,) = events + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="openai tx"): + await client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + (event,) = [item.payload for item in items if item.type == "transaction"] + assert event["contexts"]["trace"]["origin"] == "manual" - assert event["contexts"]["trace"]["origin"] == "manual" - assert event["spans"][0]["origin"] == "auto.ai.openai" + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.origin"] == "auto.ai.openai" + else: + events = capture_events() + + with start_transaction(name="openai tx"): + await client.embeddings.create( + input="hello", model="text-embedding-3-large" + ) + + (event,) = events + + assert event["contexts"]["trace"]["origin"] == "manual" + assert event["spans"][0]["origin"] == "auto.ai.openai" def test_completions_token_usage_from_response(): @@ -2514,56 +3593,111 @@ def count_tokens(msg): ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): +def test_ai_client_span_responses_api_no_pii( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.responses._post = mock.Mock(return_value=EXAMPLE_RESPONSE) - with start_transaction(name="openai tx"): - client.responses.create( - model="gpt-4o", - instructions="You are a coding assistant that talks like a pirate.", - input="How do I check if a Python object is an instance of a class?", - max_output_tokens=100, - temperature=0.7, - top_p=0.9, - ) + if stream_gen_ai_spans: + items = capture_items("span") + + with 
start_transaction(name="openai tx"): + client.responses.create( + model="gpt-4o", + instructions="You are a coding assistant that talks like a pirate.", + input="How do I check if a Python object is an instance of a class?", + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) - (transaction,) = events - spans = transaction["spans"] - - assert len(spans) == 1 - assert spans[0]["op"] == "gen_ai.responses" - assert spans[0]["origin"] == "auto.ai.openai" - assert spans[0]["data"] == { - "gen_ai.operation.name": "responses", - "gen_ai.request.max_tokens": 100, - "gen_ai.request.temperature": 0.7, - "gen_ai.request.top_p": 0.9, - "gen_ai.request.model": "gpt-4o", - "gen_ai.response.model": "response-model-id", - "gen_ai.response.streaming": False, - "gen_ai.system": "openai", - "gen_ai.usage.input_tokens": 20, - "gen_ai.usage.input_tokens.cached": 5, - "gen_ai.usage.output_tokens": 10, - "gen_ai.usage.output_tokens.reasoning": 8, - "gen_ai.usage.total_tokens": 30, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - assert "gen_ai.system_instructions" not in spans[0]["data"] - assert "gen_ai.request.messages" not in spans[0]["data"] - assert "gen_ai.response.text" not in spans[0]["data"] + spans = [item.payload for item in items if item.type == "span"] + + assert len(spans) == 1 + assert spans[0]["attributes"] == { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": False, + "gen_ai.system": "openai", + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": 
"sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert "gen_ai.system_instructions" not in spans[0]["attributes"] + assert "gen_ai.request.messages" not in spans[0]["attributes"] + assert "gen_ai.response.text" not in spans[0]["attributes"] + else: + events = capture_events() + with start_transaction(name="openai tx"): + client.responses.create( + model="gpt-4o", + instructions="You are a coding assistant that talks like a pirate.", + input="How do I check if a Python object is an instance of a class?", + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + (transaction,) = events + spans = transaction["spans"] + + assert len(spans) == 1 + assert spans[0]["op"] == "gen_ai.responses" + assert spans[0]["origin"] == "auto.ai.openai" + assert spans[0]["data"] == { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": False, + "gen_ai.system": "openai", + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert "gen_ai.system_instructions" not in spans[0]["data"] + assert "gen_ai.request.messages" not in spans[0]["data"] + assert "gen_ai.response.text" not in spans[0]["data"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "instructions", ( @@ -2630,286 +3764,550 @@ def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): ) @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_ai_client_span_responses_api( - sentry_init, capture_events, 
instructions, input, request + sentry_init, + capture_events, + capture_items, + instructions, + input, + request, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - - client = OpenAI(api_key="z") - client.responses._post = mock.Mock(return_value=EXAMPLE_RESPONSE) - - with start_transaction(name="openai tx"): - client.responses.create( - model="gpt-4o", - instructions=instructions, - input=input, - max_output_tokens=100, - temperature=0.7, - top_p=0.9, - ) - - (transaction,) = events - spans = transaction["spans"] - - assert len(spans) == 1 - assert spans[0]["op"] == "gen_ai.responses" - assert spans[0]["origin"] == "auto.ai.openai" - - expected_data = { - "gen_ai.operation.name": "responses", - "gen_ai.request.max_tokens": 100, - "gen_ai.request.temperature": 0.7, - "gen_ai.request.top_p": 0.9, - "gen_ai.system": "openai", - "gen_ai.response.model": "response-model-id", - "gen_ai.response.streaming": False, - "gen_ai.usage.input_tokens": 20, - "gen_ai.usage.input_tokens.cached": 5, - "gen_ai.usage.output_tokens": 10, - "gen_ai.usage.output_tokens.reasoning": 8, - "gen_ai.usage.total_tokens": 30, - "gen_ai.request.model": "gpt-4o", - "gen_ai.response.text": "the model response", - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - param_id = request.node.callspec.id - if "string" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.request.messages": safe_serialize( - ["How do I check if a Python object is an instance of a class?"] - ), - } - ) - elif "string" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - } - ] - ), - "gen_ai.request.messages": safe_serialize( - ["How do I check 
if a Python object is an instance of a class?"] - ), - } - ) - elif "blocks_no_type" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [{"type": "text", "content": "You are a helpful assistant."}] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "blocks_no_type" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "blocks" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [{"type": "text", "content": "You are a helpful assistant."}] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - elif "blocks" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - elif "parts_no_type" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - 
) - elif "parts_no_type" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif instructions is None or isinstance(instructions, Omit): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - else: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - - assert spans[0]["data"] == expected_data - - -@pytest.mark.parametrize( - "conversation, expected_id", - [ - pytest.param(omit, None, id="omit"), - pytest.param(None, None, id="none"), - pytest.param("conv_abc123", "conv_abc123", id="string"), - pytest.param({"id": "conv_abc123"}, "conv_abc123", id="dict"), - ], -) -@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -def test_responses_api_conversation_id( - sentry_init, capture_events, conversation, expected_id -): - sentry_init( - integrations=[OpenAIIntegration()], - traces_sample_rate=1.0, - ) - events = capture_events() client = OpenAI(api_key="z") client.responses._post = mock.Mock(return_value=EXAMPLE_RESPONSE) - with 
start_transaction(name="openai tx"): - client.responses.create( - model="gpt-4o", - input="hello", - conversation=conversation, - ) - - (transaction,) = events - (span,) = transaction["spans"] - - if expected_id is None: - assert "gen_ai.conversation.id" not in span["data"] - else: - assert span["data"]["gen_ai.conversation.id"] == expected_id - - -@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -def test_error_in_responses_api(sentry_init, capture_events): - sentry_init( - integrations=[OpenAIIntegration(include_prompts=True)], - traces_sample_rate=1.0, - send_default_pii=True, - ) - events = capture_events() - - client = OpenAI(api_key="z") - client.responses._post = mock.Mock( - side_effect=OpenAIError("API rate limit reached") - ) + if stream_gen_ai_spans: + items = capture_items("span") - with start_transaction(name="openai tx"): - with pytest.raises(OpenAIError): + with start_transaction(name="openai tx"): client.responses.create( model="gpt-4o", - instructions="You are a coding assistant that talks like a pirate.", - input="How do I check if a Python object is an instance of a class?", + instructions=instructions, + input=input, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, ) - (error_event, transaction_event) = events - - assert transaction_event["type"] == "transaction" - # make sure the span where the error occurred is captured - assert transaction_event["spans"][0]["op"] == "gen_ai.responses" - - assert error_event["level"] == "error" - assert error_event["exception"]["values"][0]["type"] == "OpenAIError" - - assert ( + spans = [item.payload for item in items if item.type == "span"] + + assert len(spans) == 1 + + expected_data = { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.system": "openai", + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": False, + 
"gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.text": "the model response", + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + param_id = request.node.callspec.id + if "string" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "string" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + } + ] + ), + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "blocks_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) 
+ elif "blocks" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif instructions is None or isinstance(instructions, Omit): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": 
"user", "content": "hello"}] + ), + } + ) + else: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + + assert spans[0]["attributes"] == expected_data + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.responses.create( + model="gpt-4o", + instructions=instructions, + input=input, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + + (transaction,) = events + spans = transaction["spans"] + + assert len(spans) == 1 + assert spans[0]["op"] == "gen_ai.responses" + assert spans[0]["origin"] == "auto.ai.openai" + + expected_data = { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.system": "openai", + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": False, + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.text": "the model response", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + param_id = request.node.callspec.id + if "string" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "string" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You 
are a coding assistant that talks like a pirate.", + } + ] + ), + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "blocks_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be 
concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif instructions is None or isinstance(instructions, Omit): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + else: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + + assert spans[0]["data"] == expected_data + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +@pytest.mark.parametrize( + "conversation, expected_id", + [ + pytest.param(omit, None, id="omit"), + pytest.param(None, None, id="none"), + pytest.param("conv_abc123", "conv_abc123", id="string"), + pytest.param({"id": "conv_abc123"}, "conv_abc123", id="dict"), + ], +) +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_api_conversation_id( + sentry_init, + capture_events, + capture_items, + conversation, + expected_id, + 
stream_gen_ai_spans, +): + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + client = OpenAI(api_key="z") + client.responses._post = mock.Mock(return_value=EXAMPLE_RESPONSE) + + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + client.responses.create( + model="gpt-4o", + input="hello", + conversation=conversation, + ) + + (span,) = (item.payload for item in items if item.type == "span") + + if expected_id is None: + assert "gen_ai.conversation.id" not in span["attributes"] + else: + assert span["attributes"]["gen_ai.conversation.id"] == expected_id + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.responses.create( + model="gpt-4o", + input="hello", + conversation=conversation, + ) + + (transaction,) = events + (span,) = transaction["spans"] + + if expected_id is None: + assert "gen_ai.conversation.id" not in span["data"] + else: + assert span["data"]["gen_ai.conversation.id"] == expected_id + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_error_in_responses_api( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): + sentry_init( + integrations=[OpenAIIntegration(include_prompts=True)], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + client = OpenAI(api_key="z") + client.responses._post = mock.Mock( + side_effect=OpenAIError("API rate limit reached") + ) + + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") + + with start_transaction(name="openai tx"), pytest.raises(OpenAIError): + client.responses.create( + model="gpt-4o", + instructions="You are a coding assistant that talks like a pirate.", + input="How do I check if a Python object is an instance of a class?", + ) + + # make 
sure the span where the error occurred is captured + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.op"] == "gen_ai.responses" + + (error_event,) = (item.payload for item in items if item.type == "event") + + assert error_event["level"] == "error" + assert error_event["exception"]["values"][0]["type"] == "OpenAIError" + + (transaction_event,) = ( + item.payload for item in items if item.type == "transaction" + ) + else: + events = capture_events() + + with start_transaction(name="openai tx"), pytest.raises(OpenAIError): + client.responses.create( + model="gpt-4o", + instructions="You are a coding assistant that talks like a pirate.", + input="How do I check if a Python object is an instance of a class?", + ) + + (error_event, transaction_event) = events + + assert transaction_event["type"] == "transaction" + # make sure the span where the error occurred is captured + assert transaction_event["spans"][0]["op"] == "gen_ai.responses" + + assert error_event["level"] == "error" + assert error_event["exception"]["values"][0]["type"] == "OpenAIError" + + assert ( error_event["contexts"]["trace"]["trace_id"] == transaction_event["contexts"]["trace"]["trace_id"] ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") @pytest.mark.parametrize( @@ -2977,212 +4375,422 @@ def test_error_in_responses_api(sentry_init, capture_events): ], ) async def test_ai_client_span_responses_async_api( - sentry_init, capture_events, instructions, input, request + sentry_init, + capture_events, + capture_items, + instructions, + input, + request, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") client.responses._post = 
AsyncMock(return_value=EXAMPLE_RESPONSE) - with start_transaction(name="openai tx"): - await client.responses.create( - model="gpt-4o", - instructions=instructions, - input=input, - max_output_tokens=100, - temperature=0.7, - top_p=0.9, - ) + if stream_gen_ai_spans: + items = capture_items("span") - (transaction,) = events - spans = transaction["spans"] - - assert len(spans) == 1 - assert spans[0]["op"] == "gen_ai.responses" - assert spans[0]["origin"] == "auto.ai.openai" - - expected_data = { - "gen_ai.operation.name": "responses", - "gen_ai.request.max_tokens": 100, - "gen_ai.request.temperature": 0.7, - "gen_ai.request.top_p": 0.9, - "gen_ai.request.messages": '["How do I check if a Python object is an instance of a class?"]', - "gen_ai.request.model": "gpt-4o", - "gen_ai.response.model": "response-model-id", - "gen_ai.response.streaming": False, - "gen_ai.system": "openai", - "gen_ai.usage.input_tokens": 20, - "gen_ai.usage.input_tokens.cached": 5, - "gen_ai.usage.output_tokens": 10, - "gen_ai.usage.output_tokens.reasoning": 8, - "gen_ai.usage.total_tokens": 30, - "gen_ai.response.text": "the model response", - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - param_id = request.node.callspec.id - if "string" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.request.messages": safe_serialize( - ["How do I check if a Python object is an instance of a class?"] - ), - } - ) - elif "string" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - } - ] - ), - "gen_ai.request.messages": safe_serialize( - ["How do I check if a Python object is an instance of a class?"] - ), - } - ) - elif "blocks_no_type" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - 
"gen_ai.system_instructions": safe_serialize( - [{"type": "text", "content": "You are a helpful assistant."}] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "blocks_no_type" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "blocks" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [{"type": "text", "content": "You are a helpful assistant."}] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - elif "blocks" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - elif "parts_no_type" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "parts_no_type" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", 
- }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif instructions is None or isinstance(instructions, Omit): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) + with start_transaction(name="openai tx"): + await client.responses.create( + model="gpt-4o", + instructions=instructions, + input=input, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + + spans = [item.payload for item in items if item.type == "span"] + + assert len(spans) == 1 + + expected_data = { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.request.messages": '["How do I check if a Python object is an instance of a class?"]', + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": False, + "gen_ai.system": "openai", + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "gen_ai.response.text": "the model response", + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + param_id = request.node.callspec.id + if "string" in param_id and ( + 
instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "string" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + } + ] + ), + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "blocks_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": 
safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif instructions is None or isinstance(instructions, Omit): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + else: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + + assert spans[0]["attributes"] == expected_data else: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks 
like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) + events = capture_events() + + with start_transaction(name="openai tx"): + await client.responses.create( + model="gpt-4o", + instructions=instructions, + input=input, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + + (transaction,) = events + spans = transaction["spans"] + + assert len(spans) == 1 + assert spans[0]["op"] == "gen_ai.responses" + assert spans[0]["origin"] == "auto.ai.openai" + + expected_data = { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.request.messages": '["How do I check if a Python object is an instance of a class?"]', + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": False, + "gen_ai.system": "openai", + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "gen_ai.response.text": "the model response", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + param_id = request.node.callspec.id + if "string" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "string" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + } + ] + ), + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an 
instance of a class?"] + ), + } + ) + elif "blocks_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" 
in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif instructions is None or isinstance(instructions, Omit): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + else: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) - assert spans[0]["data"] == expected_data + assert spans[0]["data"] == expected_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "instructions", @@ -3252,31 +4860,247 @@ async def test_ai_client_span_responses_async_api( async def test_ai_client_span_streaming_responses_async_api( sentry_init, capture_events, + capture_items, instructions, input, request, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") 
returned_stream = get_model_response( async_iterator(server_side_event_chunks(EXAMPLE_RESPONSES_STREAM)) ) - with mock.patch.object( - client.responses._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + result = await client.responses.create( + model="gpt-4o", + instructions=instructions, + input=input, + stream=True, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + async for _ in result: + pass + + spans = [item.payload for item in items if item.type == "span"] + spans = [ + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_RESPONSES + ] + + assert len(spans) == 1 + + expected_data = { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": True, + "gen_ai.system": "openai", + "gen_ai.response.time_to_first_token": mock.ANY, + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.text": "hello world", + "sentry.environment": "production", + "sentry.op": "gen_ai.responses", + "sentry.origin": "auto.ai.openai", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "openai tx", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + param_id = request.node.callspec.id + if "string" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + 
"gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "string" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + } + ] + ), + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "blocks_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" 
in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif instructions is None or isinstance(instructions, Omit): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + else: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + + assert spans[0]["attributes"] == expected_data + else: + events = capture_events() + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): result = await client.responses.create( model="gpt-4o", instructions=instructions, @@ 
-3289,220 +5113,252 @@ async def test_ai_client_span_streaming_responses_async_api( async for _ in result: pass - (transaction,) = events - spans = [span for span in transaction["spans"] if span["op"] == OP.GEN_AI_RESPONSES] - - assert len(spans) == 1 - assert spans[0]["origin"] == "auto.ai.openai" - - expected_data = { - "gen_ai.operation.name": "responses", - "gen_ai.request.max_tokens": 100, - "gen_ai.request.temperature": 0.7, - "gen_ai.request.top_p": 0.9, - "gen_ai.response.model": "response-model-id", - "gen_ai.response.streaming": True, - "gen_ai.system": "openai", - "gen_ai.response.time_to_first_token": mock.ANY, - "gen_ai.usage.input_tokens": 20, - "gen_ai.usage.input_tokens.cached": 5, - "gen_ai.usage.output_tokens": 10, - "gen_ai.usage.output_tokens.reasoning": 8, - "gen_ai.usage.total_tokens": 30, - "gen_ai.request.model": "gpt-4o", - "gen_ai.response.text": "hello world", - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - param_id = request.node.callspec.id - if "string" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.request.messages": safe_serialize( - ["How do I check if a Python object is an instance of a class?"] - ), - } - ) - elif "string" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - } - ] - ), - "gen_ai.request.messages": safe_serialize( - ["How do I check if a Python object is an instance of a class?"] - ), - } - ) - elif "blocks_no_type" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [{"type": "text", "content": "You are a helpful assistant."}] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "blocks_no_type" in param_id: - 
expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "blocks" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [{"type": "text", "content": "You are a helpful assistant."}] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - elif "blocks" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - elif "parts_no_type" in param_id and ( - instructions is None or isinstance(instructions, Omit) - ): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif "parts_no_type" in param_id: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"role": "user", "content": "hello"}] - ), - } - ) - elif instructions 
is None or isinstance(instructions, Omit): # type: ignore - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) - else: - expected_data.update( - { - "gen_ai.system_instructions": safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ), - "gen_ai.request.messages": safe_serialize( - [{"type": "message", "role": "user", "content": "hello"}] - ), - } - ) + (transaction,) = events + spans = [ + span for span in transaction["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + + assert len(spans) == 1 + assert spans[0]["origin"] == "auto.ai.openai" + + expected_data = { + "gen_ai.operation.name": "responses", + "gen_ai.request.max_tokens": 100, + "gen_ai.request.temperature": 0.7, + "gen_ai.request.top_p": 0.9, + "gen_ai.response.model": "response-model-id", + "gen_ai.response.streaming": True, + "gen_ai.system": "openai", + "gen_ai.response.time_to_first_token": mock.ANY, + "gen_ai.usage.input_tokens": 20, + "gen_ai.usage.input_tokens.cached": 5, + "gen_ai.usage.output_tokens": 10, + "gen_ai.usage.output_tokens.reasoning": 8, + "gen_ai.usage.total_tokens": 30, + "gen_ai.request.model": "gpt-4o", + "gen_ai.response.text": "hello world", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + param_id = request.node.callspec.id + if "string" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "string" in param_id: + 
expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + } + ] + ), + "gen_ai.request.messages": safe_serialize( + ["How do I check if a Python object is an instance of a class?"] + ), + } + ) + elif "blocks_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [{"type": "text", "content": "You are a helpful assistant."}] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "blocks" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id and ( + instructions is None or isinstance(instructions, Omit) + ): # type: ignore + expected_data.update( + { + 
"gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif "parts_no_type" in param_id: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"role": "user", "content": "hello"}] + ), + } + ) + elif instructions is None or isinstance(instructions, Omit): # type: ignore + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) + else: + expected_data.update( + { + "gen_ai.system_instructions": safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ), + "gen_ai.request.messages": safe_serialize( + [{"type": "message", "role": "user", "content": "hello"}] + ), + } + ) - assert spans[0]["data"] == expected_data + assert spans[0]["data"] == expected_data +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") -async def test_error_in_responses_async_api(sentry_init, capture_events): +async def test_error_in_responses_async_api( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): 
sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") client.responses._post = AsyncMock( side_effect=OpenAIError("API rate limit reached") ) - with start_transaction(name="openai tx"): - with pytest.raises(OpenAIError): + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") + + with start_transaction(name="openai tx"), pytest.raises(OpenAIError): + await client.responses.create( + model="gpt-4o", + instructions="You are a coding assistant that talks like a pirate.", + input="How do I check if a Python object is an instance of a class?", + ) + + # make sure the span where the error occurred is captured + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["attributes"]["sentry.op"] == "gen_ai.responses" + + (error_event,) = (item.payload for item in items if item.type == "event") + + assert error_event["level"] == "error" + assert error_event["exception"]["values"][0]["type"] == "OpenAIError" + + (transaction_event,) = ( + item.payload for item in items if item.type == "transaction" + ) + else: + events = capture_events() + + with start_transaction(name="openai tx"), pytest.raises(OpenAIError): await client.responses.create( model="gpt-4o", instructions="You are a coding assistant that talks like a pirate.", input="How do I check if a Python object is an instance of a class?", ) - (error_event, transaction_event) = events + (error_event, transaction_event) = events - assert transaction_event["type"] == "transaction" - # make sure the span where the error occurred is captured - assert transaction_event["spans"][0]["op"] == "gen_ai.responses" + assert transaction_event["type"] == "transaction" + # make sure the span where the error occurred is captured + assert transaction_event["spans"][0]["op"] == "gen_ai.responses" - assert 
error_event["level"] == "error" - assert error_event["exception"]["values"][0]["type"] == "OpenAIError" + assert error_event["level"] == "error" + assert error_event["exception"]["values"][0]["type"] == "OpenAIError" assert ( error_event["contexts"]["trace"]["trace_id"] @@ -3583,6 +5439,7 @@ async def test_error_in_responses_async_api(sentry_init, capture_events): ] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "send_default_pii, include_prompts", [(True, True), (True, False), (False, True), (False, False)], @@ -3591,10 +5448,12 @@ async def test_error_in_responses_async_api(sentry_init, capture_events): def test_streaming_responses_api( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -3604,8 +5463,8 @@ def test_streaming_responses_api( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -3614,12 +5473,14 @@ def test_streaming_responses_api( ) ) - with mock.patch.object( - client.responses._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.responses.create( model="some-model", input="hello", @@ -3634,30 +5495,74 @@ def test_streaming_responses_api( if hasattr(item, "delta"): response_string += item.delta - assert response_string == "hello world" + assert response_string == "hello world" - (transaction,) = events - (span,) = transaction["spans"] - assert span["op"] == "gen_ai.responses" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert 
span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + (span,) = (item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" - if send_default_pii and include_prompts: - assert span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' - assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + if send_default_pii and include_prompts: + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] + + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.responses.create( + model="some-model", + input="hello", + stream=True, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + + response_string = "" + for item in response_stream: + if hasattr(item, 
"delta"): + response_string += item.delta + + assert response_string == "hello world" + + (transaction,) = events + (span,) = transaction["spans"] + assert span["op"] == "gen_ai.responses" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" + if send_default_pii and include_prompts: + assert span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -3667,11 +5572,13 @@ def test_streaming_responses_api( async def test_streaming_responses_api_async( sentry_init, capture_events, + capture_items, send_default_pii, include_prompts, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): sentry_init( integrations=[ @@ -3681,20 +5588,22 @@ async def test_streaming_responses_api_async( ], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( async_iterator(server_side_event_chunks(EXAMPLE_RESPONSES_STREAM)) ) - with mock.patch.object( - 
client.responses._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = await client.responses.create( model="some-model", input="hello", @@ -3709,30 +5618,74 @@ async def test_streaming_responses_api_async( if hasattr(item, "delta"): response_string += item.delta - assert response_string == "hello world" + assert response_string == "hello world" + + (span,) = (item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" + assert span["attributes"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 - (transaction,) = events - (span,) = transaction["spans"] - assert span["op"] == "gen_ai.responses" - assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" - assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 - assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" - assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" + if send_default_pii and include_prompts: + assert span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' + assert span["attributes"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["attributes"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["attributes"] - if send_default_pii and include_prompts: - assert span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' - 
assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + assert span["attributes"]["gen_ai.usage.input_tokens"] == 20 + assert span["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert span["attributes"]["gen_ai.usage.total_tokens"] == 30 else: - assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] - assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + events = capture_events() + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.responses.create( + model="some-model", + input="hello", + stream=True, + max_output_tokens=100, + temperature=0.7, + top_p=0.9, + ) + + response_string = "" + async for item in response_stream: + if hasattr(item, "delta"): + response_string += item.delta + + assert response_string == "hello world" + + (transaction,) = events + (span,) = transaction["spans"] + assert span["op"] == "gen_ai.responses" + assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai" + assert span["data"][SPANDATA.GEN_AI_REQUEST_MAX_TOKENS] == 100 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TEMPERATURE] == 0.7 + assert span["data"][SPANDATA.GEN_AI_REQUEST_TOP_P] == 0.9 + + assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "response-model-id" + + if send_default_pii and include_prompts: + assert span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] == '["hello"]' + assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "hello world" + else: + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] - assert span["data"]["gen_ai.usage.input_tokens"] == 20 - assert span["data"]["gen_ai.usage.output_tokens"] == 10 - assert span["data"]["gen_ai.usage.total_tokens"] == 30 + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( OPENAI_VERSION <= (1, 1, 0), reason="OpenAI versions <=1.1.0 do not support the tools parameter.", @@ -3742,13 +5695,18 @@ async def test_streaming_responses_api_async( [[], None, NOT_GIVEN, omit], ) def test_empty_tools_in_chat_completion( - sentry_init, capture_events, tools, nonstreaming_chat_completions_model_response + sentry_init, + capture_events, + capture_items, + tools, + nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -3765,20 +5723,37 @@ def test_empty_tools_in_chat_completion( ) ) - with start_transaction(name="openai tx"): - client.chat.completions.create( - model="some-model", - messages=[{"role": "system", "content": "hello"}], - tools=tools, - ) + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + client.chat.completions.create( + model="some-model", + messages=[{"role": "system", "content": "hello"}], + tools=tools, + ) + + span = next(item.payload for item in items if item.type == "span") + + assert "gen_ai.request.available_tools" not in span["attributes"] + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.chat.completions.create( + model="some-model", + messages=[{"role": "system", "content": "hello"}], + tools=tools, + ) - (event,) = events - span = event["spans"][0] + (event,) = events + span = event["spans"][0] - assert "gen_ai.request.available_tools" not in span["data"] + assert "gen_ai.request.available_tools" not in span["data"] # Test messages with mixed roles including "ai" that should be mapped to "assistant" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( 
"test_message,expected_role", [ @@ -3796,9 +5771,11 @@ def test_empty_tools_in_chat_completion( def test_openai_message_role_mapping( sentry_init, capture_events, + capture_items, test_message, expected_role, nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): """Test that OpenAI integration properly maps message roles like 'ai' to 'assistant'""" @@ -3806,8 +5783,8 @@ def test_openai_message_role_mapping( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -3826,33 +5803,53 @@ def test_openai_message_role_mapping( test_messages = [test_message] - with start_transaction(name="openai tx"): - client.chat.completions.create(model="test-model", messages=test_messages) - # Verify that the span was created correctly - (event,) = events - span = event["spans"][0] - assert span["op"] == "gen_ai.chat" - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + if stream_gen_ai_spans: + items = capture_items("span") + + with start_transaction(name="openai tx"): + client.chat.completions.create(model="test-model", messages=test_messages) + + # Verify that the span was created correctly + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + + stored_messages = json.loads( + span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + ) + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.chat.completions.create(model="test-model", messages=test_messages) - # Parse the stored messages - import json + # Verify that the span was created correctly + (event,) = events + span = event["spans"][0] + assert span["op"] == "gen_ai.chat" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - 
stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) + stored_messages = json.loads(span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]) assert len(stored_messages) == 1 assert stored_messages[0]["role"] == expected_role +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_openai_message_truncation( - sentry_init, capture_events, nonstreaming_chat_completions_model_response + sentry_init, + capture_events, + capture_items, + nonstreaming_chat_completions_model_response, + stream_gen_ai_spans, ): """Test that large messages are truncated properly in OpenAI integration.""" sentry_init( integrations=[OpenAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") client.chat.completions._post = mock.Mock( @@ -3879,22 +5876,45 @@ def test_openai_message_truncation( {"role": "user", "content": large_content}, ] - with start_transaction(name="openai tx"): - client.chat.completions.create( - model="some-model", - messages=large_messages, - ) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with start_transaction(name="openai tx"): + client.chat.completions.create( + model="some-model", + messages=large_messages, + ) + + span = next(item.payload for item in items if item.type == "span") + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["attributes"] + + messages_data = span["attributes"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) + + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) <= len(large_messages) + + (event,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with start_transaction(name="openai tx"): + client.chat.completions.create( + model="some-model", + messages=large_messages, + ) - (event,) = events - span = 
event["spans"][0] - assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + (event,) = events + span = event["spans"][0] + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] - messages_data = span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] - assert isinstance(messages_data, str) + messages_data = span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES] + assert isinstance(messages_data, str) - parsed_messages = json.loads(messages_data) - assert isinstance(parsed_messages, list) - assert len(parsed_messages) <= len(large_messages) + parsed_messages = json.loads(messages_data) + assert isinstance(parsed_messages, list) + assert len(parsed_messages) <= len(large_messages) meta_path = event["_meta"] span_meta = meta_path["spans"]["0"]["data"] @@ -3903,8 +5923,14 @@ def test_openai_message_truncation( # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_streaming_chat_completion_ttft( - sentry_init, capture_events, get_model_response, server_side_event_chunks + sentry_init, + capture_events, + capture_items, + get_model_response, + server_side_event_chunks, + stream_gen_ai_spans, ): """ Test that streaming chat completions capture time-to-first-token (TTFT). 
@@ -3912,8 +5938,8 @@ def test_streaming_chat_completion_ttft( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( @@ -3950,12 +5976,37 @@ def test_streaming_chat_completion_ttft( ), ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "Say hello"}], + stream=True, + ) + # Consume the stream + for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.chat.completions.create( model="some-model", messages=[{"role": "user", "content": "Say hello"}], @@ -3965,25 +6016,29 @@ def test_streaming_chat_completion_ttft( for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] - # Verify TTFT is captured - assert 
SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_streaming_chat_completion_ttft_async( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): """ Test that async streaming chat completions capture time-to-first-token (TTFT). @@ -3991,8 +6046,8 @@ async def test_streaming_chat_completion_ttft_async( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( @@ -4031,12 +6086,37 @@ async def test_streaming_chat_completion_ttft_async( ) ) - with mock.patch.object( - client.chat._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "Say hello"}], + stream=True, + ) + # Consume the stream + async for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.chat" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + else: + events = capture_events() + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): 
response_stream = await client.chat.completions.create( model="some-model", messages=[{"role": "user", "content": "Say hello"}], @@ -4046,21 +6126,28 @@ async def test_streaming_chat_completion_ttft_async( async for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.chat" + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] - # Verify TTFT is captured - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_streaming_responses_api_ttft( - sentry_init, capture_events, get_model_response, server_side_event_chunks + sentry_init, + capture_events, + capture_items, + get_model_response, + server_side_event_chunks, + stream_gen_ai_spans, ): """ Test that streaming responses API captures time-to-first-token (TTFT). 
@@ -4068,20 +6155,45 @@ def test_streaming_responses_api_ttft( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = OpenAI(api_key="z") returned_stream = get_model_response( server_side_event_chunks(EXAMPLE_RESPONSES_STREAM) ) - with mock.patch.object( - client.responses._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = client.responses.create( + model="some-model", + input="hello", + stream=True, + ) + # Consume the stream + for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + else: + events = capture_events() + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = client.responses.create( model="some-model", input="hello", @@ -4091,26 +6203,30 @@ def test_streaming_responses_api_ttft( for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.responses" + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.responses" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] - # Verify TTFT is captured - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = 
span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 # noinspection PyTypeChecker +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") async def test_streaming_responses_api_ttft_async( sentry_init, capture_events, + capture_items, get_model_response, async_iterator, server_side_event_chunks, + stream_gen_ai_spans, ): """ Test that async streaming responses API captures time-to-first-token (TTFT). @@ -4118,20 +6234,45 @@ async def test_streaming_responses_api_ttft_async( sentry_init( integrations=[OpenAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() client = AsyncOpenAI(api_key="z") returned_stream = get_model_response( async_iterator(server_side_event_chunks(EXAMPLE_RESPONSES_STREAM)) ) - with mock.patch.object( - client.responses._client._client, - "send", - return_value=returned_stream, - ): - with start_transaction(name="openai tx"): + if stream_gen_ai_spans: + items = capture_items("span") + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): + response_stream = await client.responses.create( + model="some-model", + input="hello", + stream=True, + ) + # Consume the stream + async for _ in response_stream: + pass + + span = next(item.payload for item in items if item.type == "span") + assert span["attributes"]["sentry.op"] == "gen_ai.responses" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["attributes"] + ttft = span["attributes"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + else: + events = capture_events() + + with mock.patch.object( + client.responses._client._client, + "send", + return_value=returned_stream, + ), start_transaction(name="openai tx"): response_stream = await 
client.responses.create( model="some-model", input="hello", @@ -4141,12 +6282,13 @@ async def test_streaming_responses_api_ttft_async( async for _ in response_stream: pass - (tx,) = events - span = tx["spans"][0] - assert span["op"] == "gen_ai.responses" + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.responses" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] - # Verify TTFT is captured - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] - ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] assert isinstance(ttft, float) assert ttft > 0 diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index 7310e86df5..bf44562b14 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -157,13 +157,16 @@ def test_agent_custom_model(): ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_agent_invocation_span_no_pii( sentry_init, capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, + stream_gen_ai_spans, ): client = AsyncOpenAI(api_key="test-key") model = OpenAIResponsesModel(model="gpt-4", openai_client=client) @@ -173,60 +176,125 @@ async def test_agent_invocation_span_no_pii( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=False, + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + 
send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - events = capture_events() + (transaction,) = (item.payload for item in items if item.type == "transaction") + + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config + assert invoke_agent_span["name"] == "invoke_agent test_agent" + + assert ( + SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["attributes"] ) + assert "gen_ai.request.messages" not in invoke_agent_span["attributes"] + assert "gen_ai.response.text" not in invoke_agent_span["attributes"] - assert result is not None - assert result.final_output == "Hello, how can I help you?" 
+ assert ( + invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + ) + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" 
- assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["data"] - assert "gen_ai.request.messages" not in invoke_agent_span["data"] - assert "gen_ai.response.text" not in invoke_agent_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["data"] + assert "gen_ai.request.messages" not in invoke_agent_span["data"] + assert "gen_ai.response.text" not in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["data"]["gen_ai.system"] == "openai" + assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 
+ assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span["description"] == "chat gpt-4" + assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["data"]["gen_ai.system"] == "openai" + assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "instructions", @@ -306,12 +374,14 @@ async def test_agent_invocation_span_no_pii( async def test_agent_invocation_span( sentry_init, capture_events, + capture_items, test_agent_with_instructions, nonstreaming_responses_model_response, instructions, input, request, get_model_response, + stream_gen_ai_spans, ): """ Test that the integration creates spans for agent invocations. 
@@ -324,156 +394,352 @@ async def test_agent_invocation_span( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - result = await agents.Runner.run( - agent, - input, - run_config=test_run_config, - ) + result = await agents.Runner.run( + agent, + input, + run_config=test_run_config, + ) - assert result is not None - assert result.final_output == "Hello, how can I help you?" + assert result is not None + assert result.final_output == "Hello, how can I help you?" - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span, ai_client_span = spans + (transaction,) = (item.payload for item in items if item.type == "transaction") - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span, ai_client_span = spans - # Only first case checks "gen_ai.request.messages" until further input handling work. 
- param_id = request.node.callspec.id - if "string" in param_id and instructions is None: # type: ignore - assert "gen_ai.system_instructions" not in ai_client_span["data"] + assert invoke_agent_span["name"] == "invoke_agent test_agent" - assert invoke_agent_span["data"]["gen_ai.request.messages"] == safe_serialize( - [ - {"content": [{"text": "Test input", "type": "text"}], "role": "user"}, - ] - ) + # Only first case checks "gen_ai.request.messages" until further input handling work. + param_id = request.node.callspec.id + if "string" in param_id and instructions is None: # type: ignore + assert "gen_ai.system_instructions" not in ai_client_span["attributes"] - elif "string" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - ] - ) - elif "blocks_no_type" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "blocks_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "blocks" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "blocks" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "parts_no_type" in param_id and instructions is None: - assert 
ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ) - elif "parts_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] + assert invoke_agent_span["attributes"][ + "gen_ai.request.messages" + ] == safe_serialize( + [ + { + "content": [{"text": "Test input", "type": "text"}], + "role": "user", + }, + ] + ) + elif "string" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + ] + ) + elif "blocks_no_type" in param_id and instructions is None: # type: ignore + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks_no_type" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id and instructions is None: # type: ignore + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You 
are a helpful assistant."}, + ] + ) + elif "parts_no_type" in param_id and instructions is None: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif "parts_no_type" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif instructions is None: # type: ignore + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + else: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + + assert ( + invoke_agent_span["attributes"]["gen_ai.response.text"] + == "Hello, how can I help you?" 
) - elif instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] + + assert ( + invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" ) + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 + else: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ) + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - assert ( - 
invoke_agent_span["data"]["gen_ai.response.text"] - == "Hello, how can I help you?" - ) + result = await agents.Runner.run( + agent, + input, + run_config=test_run_config, + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span, ai_client_span = spans - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + assert invoke_agent_span["description"] == "invoke_agent test_agent" + # Only first case checks "gen_ai.request.messages" until further input handling work. 
+ param_id = request.node.callspec.id + if "string" in param_id and instructions is None: # type: ignore + assert "gen_ai.system_instructions" not in ai_client_span["data"] + + assert invoke_agent_span["data"][ + "gen_ai.request.messages" + ] == safe_serialize( + [ + { + "content": [{"text": "Test input", "type": "text"}], + "role": "user", + }, + ] + ) + + elif "string" in param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + ] + ) + elif "blocks_no_type" in param_id and instructions is None: # type: ignore + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks_no_type" in param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id and instructions is None: # type: ignore + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "parts_no_type" in param_id and instructions is None: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif "parts_no_type" in param_id: + assert ai_client_span["data"][ + 
"gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif instructions is None: # type: ignore + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + else: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + + assert ( + invoke_agent_span["data"]["gen_ai.response.text"] + == "Hello, how can I help you?" + ) + + assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["data"]["gen_ai.system"] == "openai" + assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["description"] == "chat gpt-4" + assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["data"]["gen_ai.system"] == "openai" + assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + + 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_client_span_custom_model( sentry_init, capture_events, + capture_items, test_agent_custom_model, nonstreaming_responses_model_response, get_model_response, + stream_gen_ai_spans, ): """ Test that the integration uses the correct model name if a custom model is used. @@ -487,39 +753,71 @@ async def test_client_span_custom_model( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span") - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" + + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - assert result is not None - assert result.final_output == "Hello, how can I help you?" 
+ assert ai_client_span["name"] == "chat my-custom-model" + assert ai_client_span["attributes"]["gen_ai.request.model"] == "my-custom-model" + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" - (transaction,) = events - spans = transaction["spans"] - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + (transaction,) = events + spans = transaction["spans"] + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) - assert ai_client_span["description"] == "chat my-custom-model" - assert ai_client_span["data"]["gen_ai.request.model"] == "my-custom-model" + assert ai_client_span["description"] == "chat my-custom-model" + assert ai_client_span["data"]["gen_ai.request.model"] == "my-custom-model" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_agent_invocation_span_sync_no_pii( sentry_init, capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, + stream_gen_ai_spans, ): """ Test that the integration creates spans for agent invocations. 
@@ -532,55 +830,119 @@ def test_agent_invocation_span_sync_no_pii( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=False, + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + result = agents.Runner.run_sync( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" + + (transaction,) = (item.payload for item in items if item.type == "transaction") + + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - events = capture_events() + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert ( + invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + ) + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert 
invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + assert ( + SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["attributes"] + ) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - result = agents.Runner.run_sync(agent, "Test input", run_config=test_run_config) + result = agents.Runner.run_sync( + agent, "Test input", run_config=test_run_config + ) - assert result is not None - assert result.final_output == "Hello, how can I help you?" + assert result is not None + assert result.final_output == "Hello, how can I help you?" 
- (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["data"]["gen_ai.system"] == "openai" + assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - 
assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span["description"] == "chat gpt-4" + assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["data"]["gen_ai.system"] == "openai" + assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["data"] + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in invoke_agent_span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "instructions", ( @@ -659,12 +1021,14 @@ def test_agent_invocation_span_sync_no_pii( def test_agent_invocation_span_sync( sentry_init, capture_events, + capture_items, test_agent_with_instructions, nonstreaming_responses_model_response, instructions, input, request, get_model_response, + stream_gen_ai_spans, ): """ Test that the integration creates spans for agent invocations. 
@@ -677,137 +1041,313 @@ def test_agent_invocation_span_sync( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - result = agents.Runner.run_sync( - agent, - input, - run_config=test_run_config, - ) + result = agents.Runner.run_sync( + agent, + input, + run_config=test_run_config, + ) - assert result is not None - assert result.final_output == "Hello, how can I help you?" - - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span, ai_client_span = spans - - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - assert invoke_agent_span["data"]["gen_ai.system"] == "openai" - assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 - - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span["data"]["gen_ai.system"] == "openai" - assert 
ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 - - param_id = request.node.callspec.id - if "string" in param_id and instructions is None: # type: ignore - assert "gen_ai.system_instructions" not in ai_client_span["data"] - elif "string" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - ] - ) - elif "blocks_no_type" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "blocks_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "blocks" in param_id and instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "blocks" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - ] - ) - elif "parts_no_type" in param_id and instructions is None: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise 
and clear."}, - ] - ) - elif "parts_no_type" in param_id: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ) - elif instructions is None: # type: ignore - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] + assert result is not None + assert result.final_output == "Hello, how can I help you?" + + (transaction,) = (item.payload for item in items if item.type == "transaction") + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span, ai_client_span = spans + + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert ( + invoke_agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" ) + assert invoke_agent_span["attributes"]["gen_ai.system"] == "openai" + assert invoke_agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert 
ai_client_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["attributes"]["gen_ai.request.top_p"] == 1.0 + + param_id = request.node.callspec.id + if "string" in param_id and instructions is None: # type: ignore + assert "gen_ai.system_instructions" not in ai_client_span["attributes"] + elif "string" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + ] + ) + elif "blocks_no_type" in param_id and instructions is None: # type: ignore + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks_no_type" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id and instructions is None: # type: ignore + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "parts_no_type" in param_id and instructions is None: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif 
"parts_no_type" in param_id: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif instructions is None: # type: ignore + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + else: + assert ai_client_span["attributes"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) else: - assert ai_client_span["data"]["gen_ai.system_instructions"] == safe_serialize( - [ - { - "type": "text", - "content": "You are a coding assistant that talks like a pirate.", - }, - {"type": "text", "content": "You are a helpful assistant."}, - {"type": "text", "content": "Be concise and clear."}, - ] - ) + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + events = capture_events() + + result = agents.Runner.run_sync( + agent, + input, + run_config=test_run_config, + ) + + assert result is not None + assert result.final_output == "Hello, how can I help you?" 
+ + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span, ai_client_span = spans + + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert invoke_agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" + assert invoke_agent_span["data"]["gen_ai.system"] == "openai" + assert invoke_agent_span["data"]["gen_ai.agent.name"] == "test_agent" + assert invoke_agent_span["data"]["gen_ai.request.max_tokens"] == 100 + assert invoke_agent_span["data"]["gen_ai.request.model"] == "gpt-4" + assert invoke_agent_span["data"]["gen_ai.request.temperature"] == 0.7 + assert invoke_agent_span["data"]["gen_ai.request.top_p"] == 1.0 + + assert ai_client_span["description"] == "chat gpt-4" + assert ai_client_span["data"]["gen_ai.operation.name"] == "chat" + assert ai_client_span["data"]["gen_ai.system"] == "openai" + assert ai_client_span["data"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span["data"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span["data"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span["data"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span["data"]["gen_ai.request.top_p"] == 1.0 + + param_id = request.node.callspec.id + if "string" in param_id and instructions is None: # type: ignore + assert "gen_ai.system_instructions" not in ai_client_span["data"] + elif "string" in param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + ] + ) + elif "blocks_no_type" in param_id and instructions is None: # type: ignore + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks_no_type" in 
param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id and instructions is None: # type: ignore + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "blocks" in param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + ] + ) + elif "parts_no_type" in param_id and instructions is None: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif "parts_no_type" in param_id: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + elif instructions is None: # type: ignore + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) + else: + assert ai_client_span["data"][ + "gen_ai.system_instructions" + ] == safe_serialize( + [ + { + "type": "text", + "content": "You are a coding assistant that talks like a pirate.", + }, + {"type": "text", "content": "You are a helpful assistant."}, + {"type": "text", "content": "Be concise and clear."}, + ] + ) 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_handoff_span(sentry_init, capture_events, get_model_response): +async def test_handoff_span( + sentry_init, + capture_events, + capture_items, + get_model_response, + stream_gen_ai_spans, +): """ Test that handoff spans are created when agents hand off to other agents. """ @@ -900,41 +1440,83 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): serialize_pydantic=True, ) - with patch.object( - primary_agent.model._client._client, - "send", - side_effect=[handoff_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + primary_agent.model._client._client, + "send", + side_effect=[handoff_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("transaction", "span") - result = await agents.Runner.run( - primary_agent, - "Please hand off to secondary agent", - run_config=test_run_config, + result = await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + ) + + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + handoff_span = next( + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_HANDOFF ) - assert result is not None + # Verify handoff span was created + assert handoff_span is not None + assert handoff_span["name"] == "handoff from primary_agent to secondary_agent" + assert handoff_span["attributes"]["gen_ai.operation.name"] == "handoff" + else: + with patch.object( + primary_agent.model._client._client, + "send", + side_effect=[handoff_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + 
traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + result = await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + ) + + assert result is not None - (transaction,) = events - spans = transaction["spans"] - handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) + (transaction,) = events + spans = transaction["spans"] + handoff_span = next( + span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF + ) - # Verify handoff span was created - assert handoff_span is not None - assert ( - handoff_span["description"] == "handoff from primary_agent to secondary_agent" - ) - assert handoff_span["data"]["gen_ai.operation.name"] == "handoff" + # Verify handoff span was created + assert handoff_span is not None + assert ( + handoff_span["description"] + == "handoff from primary_agent to secondary_agent" + ) + assert handoff_span["data"]["gen_ai.operation.name"] == "handoff" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_max_turns_before_handoff_span( - sentry_init, capture_events, get_model_response + sentry_init, + capture_events, + capture_items, + get_model_response, + stream_gen_ai_spans, ): """ Example raising agents.exceptions.AgentsException after the agent invocation span is complete. 
@@ -1028,45 +1610,85 @@ async def test_max_turns_before_handoff_span( serialize_pydantic=True, ) - with patch.object( - primary_agent.model._client._client, - "send", - side_effect=[handoff_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + primary_agent.model._client._client, + "send", + side_effect=[handoff_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("transaction", "span") - with pytest.raises(MaxTurnsExceeded): - await agents.Runner.run( - primary_agent, - "Please hand off to secondary agent", - run_config=test_run_config, - max_turns=1, + with pytest.raises(MaxTurnsExceeded): + await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + max_turns=1, + ) + + spans = [item.payload for item in items if item.type == "span"] + handoff_span = next( + span + for span in spans + if span["attributes"].get("sentry.op") == OP.GEN_AI_HANDOFF + ) + + # Verify handoff span was created + assert handoff_span is not None + assert handoff_span["name"] == "handoff from primary_agent to secondary_agent" + assert handoff_span["attributes"]["gen_ai.operation.name"] == "handoff" + else: + with patch.object( + primary_agent.model._client._client, + "send", + side_effect=[handoff_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) + events = capture_events() + + with pytest.raises(MaxTurnsExceeded): + await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + max_turns=1, + ) - (error, transaction) = events - spans = transaction["spans"] - handoff_span = next(span for 
span in spans if span.get("op") == OP.GEN_AI_HANDOFF) + (error, transaction) = events + spans = transaction["spans"] + handoff_span = next( + span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF + ) - # Verify handoff span was created - assert handoff_span is not None - assert ( - handoff_span["description"] == "handoff from primary_agent to secondary_agent" - ) - assert handoff_span["data"]["gen_ai.operation.name"] == "handoff" + # Verify handoff span was created + assert handoff_span is not None + assert ( + handoff_span["description"] + == "handoff from primary_agent to secondary_agent" + ) + assert handoff_span["data"]["gen_ai.operation.name"] == "handoff" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_tool_execution_span( sentry_init, capture_events, + capture_items, test_agent, get_model_response, responses_tool_call_model_responses, + stream_gen_ai_spans, ): """ Test tool execution span creation. @@ -1124,195 +1746,413 @@ def simple_test_tool(message: str) -> str: serialize_pydantic=True, ) - with patch.object( - agent_with_tool.model._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, + if stream_gen_ai_spans: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("transaction", "span") + + await agents.Runner.run( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) + + (transaction,) = (item.payload for item in items if item.type == "transaction") + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == 
"auto.ai.openai_agents" + + spans = [item.payload for item in items if item.type == "span"] + agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span1, ai_client_span2 = ( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) + tool_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_EXECUTE_TOOL ) - events = capture_events() + available_tool = { + "name": "simple_test_tool", + "description": "A simple tool", + "params_json_schema": { + "properties": {"message": {"title": "Message", "type": "string"}}, + "required": ["message"], + "title": "simple_test_tool_args", + "type": "object", + "additionalProperties": False, + }, + "on_invoke_tool": mock.ANY, + "strict_json_schema": True, + "is_enabled": True, + } + + if parse_version(OPENAI_AGENTS_VERSION) >= (0, 3, 3): + available_tool.update( + {"tool_input_guardrails": None, "tool_output_guardrails": None} + ) - await agents.Runner.run( - agent_with_tool, - "Please use the simple test tool", - run_config=test_run_config, + if parse_version(OPENAI_AGENTS_VERSION) >= ( + 0, + 8, + ): + available_tool["needs_approval"] = False + if parse_version(OPENAI_AGENTS_VERSION) >= ( + 0, + 9, + 0, + ): + available_tool.update( + { + "timeout_seconds": None, + "timeout_behavior": "error_as_result", + "timeout_error_function": None, + } + ) + + assert agent_span["name"] == "invoke_agent test_agent" + assert agent_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" + assert agent_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert agent_span["attributes"]["gen_ai.operation.name"] == "invoke_agent" + + agent_span_available_tool = json.loads( + agent_span["attributes"]["gen_ai.request.available_tools"] + )[0] + + assert all(agent_span_available_tool[k] == v for k, v in available_tool.items()) + + assert agent_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert 
agent_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert agent_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert agent_span["attributes"]["gen_ai.request.top_p"] == 1.0 + assert agent_span["attributes"]["gen_ai.system"] == "openai" + + assert ai_client_span1["name"] == "chat gpt-4" + assert ai_client_span1["attributes"]["gen_ai.operation.name"] == "chat" + assert ai_client_span1["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span1["attributes"]["gen_ai.agent.name"] == "test_agent" + + ai_client_span1_available_tool = json.loads( + ai_client_span1["attributes"]["gen_ai.request.available_tools"] + )[0] + + assert all( + ai_client_span1_available_tool[k] == v for k, v in available_tool.items() ) - (transaction,) = events - spans = transaction["spans"] - agent_span = next(span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT) - ai_client_span1, ai_client_span2 = ( - span for span in spans if span["op"] == OP.GEN_AI_CHAT - ) - tool_span = next(span for span in spans if span["op"] == OP.GEN_AI_EXECUTE_TOOL) - - available_tool = { - "name": "simple_test_tool", - "description": "A simple tool", - "params_json_schema": { - "properties": {"message": {"title": "Message", "type": "string"}}, - "required": ["message"], - "title": "simple_test_tool_args", - "type": "object", - "additionalProperties": False, - }, - "on_invoke_tool": mock.ANY, - "strict_json_schema": True, - "is_enabled": True, - } + assert ai_client_span1["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span1["attributes"][ + "gen_ai.request.messages" + ] == safe_serialize( + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please use the simple test tool"} + ], + }, + ] + ) + assert ai_client_span1["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span1["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span1["attributes"]["gen_ai.request.top_p"] == 1.0 + assert 
ai_client_span1["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert ai_client_span1["attributes"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ai_client_span1["attributes"]["gen_ai.usage.output_tokens"] == 5 + assert ( + ai_client_span1["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 0 + ) + assert ai_client_span1["attributes"]["gen_ai.usage.total_tokens"] == 15 + + tool_call = { + "arguments": '{"message": "hello"}', + "call_id": "call_123", + "name": "simple_test_tool", + "type": "function_call", + "id": "call_123", + "status": None, + } + + if OPENAI_VERSION >= (2, 25, 0): + tool_call["namespace"] = None + + assert json.loads( + ai_client_span1["attributes"]["gen_ai.response.tool_calls"] + ) == [tool_call] + + assert tool_span["name"] == "execute_tool simple_test_tool" + assert tool_span["attributes"]["gen_ai.agent.name"] == "test_agent" + assert tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" + + tool_span_available_tool = json.loads( + tool_span["attributes"]["gen_ai.request.available_tools"] + )[0] + + assert all(tool_span_available_tool[k] == v for k, v in available_tool.items()) + + assert tool_span["attributes"]["gen_ai.request.max_tokens"] == 100 + assert tool_span["attributes"]["gen_ai.request.model"] == "gpt-4" + assert tool_span["attributes"]["gen_ai.request.temperature"] == 0.7 + assert tool_span["attributes"]["gen_ai.request.top_p"] == 1.0 + assert tool_span["attributes"]["gen_ai.system"] == "openai" + assert tool_span["attributes"]["gen_ai.tool.description"] == "A simple tool" + assert tool_span["attributes"]["gen_ai.tool.input"] == '{"message": "hello"}' + assert tool_span["attributes"]["gen_ai.tool.name"] == "simple_test_tool" + assert ( + tool_span["attributes"]["gen_ai.tool.output"] == "Tool executed with: hello" + ) + assert ai_client_span2["name"] == "chat gpt-4" + assert ai_client_span2["attributes"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span2["attributes"]["gen_ai.operation.name"] == 
"chat" - if parse_version(OPENAI_AGENTS_VERSION) >= (0, 3, 3): - available_tool.update( - {"tool_input_guardrails": None, "tool_output_guardrails": None} + ai_client_span2_available_tool = json.loads( + ai_client_span2["attributes"]["gen_ai.request.available_tools"] + )[0] + + assert all( + ai_client_span2_available_tool[k] == v for k, v in available_tool.items() ) - if parse_version(OPENAI_AGENTS_VERSION) >= ( - 0, - 8, - ): - available_tool["needs_approval"] = False - if parse_version(OPENAI_AGENTS_VERSION) >= ( - 0, - 9, - 0, - ): - available_tool.update( - { - "timeout_seconds": None, - "timeout_behavior": "error_as_result", - "timeout_error_function": None, - } - ) - - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - - assert agent_span["description"] == "invoke_agent test_agent" - assert agent_span["origin"] == "auto.ai.openai_agents" - assert agent_span["data"]["gen_ai.agent.name"] == "test_agent" - assert agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" - - agent_span_available_tool = json.loads( - agent_span["data"]["gen_ai.request.available_tools"] - )[0] - assert all(agent_span_available_tool[k] == v for k, v in available_tool.items()) - - assert agent_span["data"]["gen_ai.request.max_tokens"] == 100 - assert agent_span["data"]["gen_ai.request.model"] == "gpt-4" - assert agent_span["data"]["gen_ai.request.temperature"] == 0.7 - assert agent_span["data"]["gen_ai.request.top_p"] == 1.0 - assert agent_span["data"]["gen_ai.system"] == "openai" - - assert ai_client_span1["description"] == "chat gpt-4" - assert ai_client_span1["data"]["gen_ai.operation.name"] == "chat" - assert ai_client_span1["data"]["gen_ai.system"] == "openai" - assert ai_client_span1["data"]["gen_ai.agent.name"] == "test_agent" - - ai_client_span1_available_tool = json.loads( - ai_client_span1["data"]["gen_ai.request.available_tools"] - )[0] - assert all( - 
ai_client_span1_available_tool[k] == v for k, v in available_tool.items() - ) + assert ai_client_span2["attributes"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span2["attributes"][ + "gen_ai.request.messages" + ] == safe_serialize( + [ + { + "role": "tool", + "content": [ + { + "call_id": "call_123", + "output": "Tool executed with: hello", + "type": "function_call_output", + } + ], + }, + ] + ) + assert ai_client_span2["attributes"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span2["attributes"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span2["attributes"]["gen_ai.request.top_p"] == 1.0 + assert ( + ai_client_span2["attributes"]["gen_ai.response.text"] + == "Task completed using the tool" + ) + assert ai_client_span2["attributes"]["gen_ai.system"] == "openai" + assert ai_client_span2["attributes"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ai_client_span2["attributes"]["gen_ai.usage.input_tokens"] == 15 + assert ( + ai_client_span2["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 0 + ) + assert ai_client_span2["attributes"]["gen_ai.usage.output_tokens"] == 10 + assert ai_client_span2["attributes"]["gen_ai.usage.total_tokens"] == 25 - assert ai_client_span1["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span1["data"]["gen_ai.request.messages"] == safe_serialize( - [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Please use the simple test tool"} - ], - }, - ] - ) - assert ai_client_span1["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span1["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span1["data"]["gen_ai.request.top_p"] == 1.0 - assert ai_client_span1["data"]["gen_ai.usage.input_tokens"] == 10 - assert ai_client_span1["data"]["gen_ai.usage.input_tokens.cached"] == 0 - assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5 - assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 - assert 
ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15 - - tool_call = { - "arguments": '{"message": "hello"}', - "call_id": "call_123", - "name": "simple_test_tool", - "type": "function_call", - "id": "call_123", - "status": None, - } - - if OPENAI_VERSION >= (2, 25, 0): - tool_call["namespace"] = None - - assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [ - tool_call - ] + else: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - assert tool_span["description"] == "execute_tool simple_test_tool" - assert tool_span["data"]["gen_ai.agent.name"] == "test_agent" - assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - - tool_span_available_tool = json.loads( - tool_span["data"]["gen_ai.request.available_tools"] - )[0] - assert all(tool_span_available_tool[k] == v for k, v in available_tool.items()) - - assert tool_span["data"]["gen_ai.request.max_tokens"] == 100 - assert tool_span["data"]["gen_ai.request.model"] == "gpt-4" - assert tool_span["data"]["gen_ai.request.temperature"] == 0.7 - assert tool_span["data"]["gen_ai.request.top_p"] == 1.0 - assert tool_span["data"]["gen_ai.system"] == "openai" - assert tool_span["data"]["gen_ai.tool.description"] == "A simple tool" - assert tool_span["data"]["gen_ai.tool.input"] == '{"message": "hello"}' - assert tool_span["data"]["gen_ai.tool.name"] == "simple_test_tool" - assert tool_span["data"]["gen_ai.tool.output"] == "Tool executed with: hello" - assert ai_client_span2["description"] == "chat gpt-4" - assert ai_client_span2["data"]["gen_ai.agent.name"] == "test_agent" - assert ai_client_span2["data"]["gen_ai.operation.name"] == "chat" - - ai_client_span2_available_tool = json.loads( - ai_client_span2["data"]["gen_ai.request.available_tools"] - 
)[0] - assert all( - ai_client_span2_available_tool[k] == v for k, v in available_tool.items() - ) + events = capture_events() - assert ai_client_span2["data"]["gen_ai.request.max_tokens"] == 100 - assert ai_client_span2["data"]["gen_ai.request.messages"] == safe_serialize( - [ - { - "role": "tool", - "content": [ - { - "call_id": "call_123", - "output": "Tool executed with: hello", - "type": "function_call_output", - } - ], + await agents.Runner.run( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) + + (transaction,) = events + spans = transaction["spans"] + agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span1, ai_client_span2 = ( + span for span in spans if span["op"] == OP.GEN_AI_CHAT + ) + tool_span = next(span for span in spans if span["op"] == OP.GEN_AI_EXECUTE_TOOL) + + available_tool = { + "name": "simple_test_tool", + "description": "A simple tool", + "params_json_schema": { + "properties": {"message": {"title": "Message", "type": "string"}}, + "required": ["message"], + "title": "simple_test_tool_args", + "type": "object", + "additionalProperties": False, }, + "on_invoke_tool": mock.ANY, + "strict_json_schema": True, + "is_enabled": True, + } + + if parse_version(OPENAI_AGENTS_VERSION) >= (0, 3, 3): + available_tool.update( + {"tool_input_guardrails": None, "tool_output_guardrails": None} + ) + + if parse_version(OPENAI_AGENTS_VERSION) >= ( + 0, + 8, + ): + available_tool["needs_approval"] = False + if parse_version(OPENAI_AGENTS_VERSION) >= ( + 0, + 9, + 0, + ): + available_tool.update( + { + "timeout_seconds": None, + "timeout_behavior": "error_as_result", + "timeout_error_function": None, + } + ) + + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + assert agent_span["description"] == "invoke_agent test_agent" + assert agent_span["origin"] == "auto.ai.openai_agents" + 
assert agent_span["data"]["gen_ai.agent.name"] == "test_agent" + assert agent_span["data"]["gen_ai.operation.name"] == "invoke_agent" + + agent_span_available_tool = json.loads( + agent_span["data"]["gen_ai.request.available_tools"] + )[0] + assert all(agent_span_available_tool[k] == v for k, v in available_tool.items()) + + assert agent_span["data"]["gen_ai.request.max_tokens"] == 100 + assert agent_span["data"]["gen_ai.request.model"] == "gpt-4" + assert agent_span["data"]["gen_ai.request.temperature"] == 0.7 + assert agent_span["data"]["gen_ai.request.top_p"] == 1.0 + assert agent_span["data"]["gen_ai.system"] == "openai" + + assert ai_client_span1["description"] == "chat gpt-4" + assert ai_client_span1["data"]["gen_ai.operation.name"] == "chat" + assert ai_client_span1["data"]["gen_ai.system"] == "openai" + assert ai_client_span1["data"]["gen_ai.agent.name"] == "test_agent" + + ai_client_span1_available_tool = json.loads( + ai_client_span1["data"]["gen_ai.request.available_tools"] + )[0] + assert all( + ai_client_span1_available_tool[k] == v for k, v in available_tool.items() + ) + + assert ai_client_span1["data"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span1["data"]["gen_ai.request.messages"] == safe_serialize( + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please use the simple test tool"} + ], + }, + ] + ) + assert ai_client_span1["data"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span1["data"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span1["data"]["gen_ai.request.top_p"] == 1.0 + assert ai_client_span1["data"]["gen_ai.usage.input_tokens"] == 10 + assert ai_client_span1["data"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5 + assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 + assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15 + + tool_call = { + "arguments": '{"message": "hello"}', + "call_id": 
"call_123", + "name": "simple_test_tool", + "type": "function_call", + "id": "call_123", + "status": None, + } + + if OPENAI_VERSION >= (2, 25, 0): + tool_call["namespace"] = None + + assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [ + tool_call ] - ) - assert ai_client_span2["data"]["gen_ai.request.model"] == "gpt-4" - assert ai_client_span2["data"]["gen_ai.request.temperature"] == 0.7 - assert ai_client_span2["data"]["gen_ai.request.top_p"] == 1.0 - assert ( - ai_client_span2["data"]["gen_ai.response.text"] - == "Task completed using the tool" - ) - assert ai_client_span2["data"]["gen_ai.system"] == "openai" - assert ai_client_span2["data"]["gen_ai.usage.input_tokens.cached"] == 0 - assert ai_client_span2["data"]["gen_ai.usage.input_tokens"] == 15 - assert ai_client_span2["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 - assert ai_client_span2["data"]["gen_ai.usage.output_tokens"] == 10 - assert ai_client_span2["data"]["gen_ai.usage.total_tokens"] == 25 + + assert tool_span["description"] == "execute_tool simple_test_tool" + assert tool_span["data"]["gen_ai.agent.name"] == "test_agent" + assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" + + tool_span_available_tool = json.loads( + tool_span["data"]["gen_ai.request.available_tools"] + )[0] + assert all(tool_span_available_tool[k] == v for k, v in available_tool.items()) + + assert tool_span["data"]["gen_ai.request.max_tokens"] == 100 + assert tool_span["data"]["gen_ai.request.model"] == "gpt-4" + assert tool_span["data"]["gen_ai.request.temperature"] == 0.7 + assert tool_span["data"]["gen_ai.request.top_p"] == 1.0 + assert tool_span["data"]["gen_ai.system"] == "openai" + assert tool_span["data"]["gen_ai.tool.description"] == "A simple tool" + assert tool_span["data"]["gen_ai.tool.input"] == '{"message": "hello"}' + assert tool_span["data"]["gen_ai.tool.name"] == "simple_test_tool" + assert tool_span["data"]["gen_ai.tool.output"] == "Tool executed with: hello" + 
assert ai_client_span2["description"] == "chat gpt-4" + assert ai_client_span2["data"]["gen_ai.agent.name"] == "test_agent" + assert ai_client_span2["data"]["gen_ai.operation.name"] == "chat" + + ai_client_span2_available_tool = json.loads( + ai_client_span2["data"]["gen_ai.request.available_tools"] + )[0] + assert all( + ai_client_span2_available_tool[k] == v for k, v in available_tool.items() + ) + + assert ai_client_span2["data"]["gen_ai.request.max_tokens"] == 100 + assert ai_client_span2["data"]["gen_ai.request.messages"] == safe_serialize( + [ + { + "role": "tool", + "content": [ + { + "call_id": "call_123", + "output": "Tool executed with: hello", + "type": "function_call_output", + } + ], + }, + ] + ) + assert ai_client_span2["data"]["gen_ai.request.model"] == "gpt-4" + assert ai_client_span2["data"]["gen_ai.request.temperature"] == 0.7 + assert ai_client_span2["data"]["gen_ai.request.top_p"] == 1.0 + assert ( + ai_client_span2["data"]["gen_ai.response.text"] + == "Task completed using the tool" + ) + assert ai_client_span2["data"]["gen_ai.system"] == "openai" + assert ai_client_span2["data"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ai_client_span2["data"]["gen_ai.usage.input_tokens"] == 15 + assert ai_client_span2["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 + assert ai_client_span2["data"]["gen_ai.usage.output_tokens"] == 10 + assert ai_client_span2["data"]["gen_ai.usage.total_tokens"] == 25 @pytest.mark.asyncio @@ -1569,8 +2409,15 @@ async def test_hosted_mcp_tool_propagation_headers( assert hosted_mcp_tool["headers"]["baggage"] == expected_outgoing_baggage +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_model_behavior_error(sentry_init, capture_events, test_agent): +async def test_model_behavior_error( + sentry_init, + capture_events, + capture_items, + test_agent, + stream_gen_ai_spans, +): """ Example raising agents.exceptions.AgentsException before the agent invocation span is 
complete. The mocked API response indicates that "wrong_tool" was called. @@ -1584,8 +2431,62 @@ def simple_test_tool(message: str) -> str: # Create agent with the tool agent_with_tool = test_agent.clone(tools=[simple_test_tool]) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( + if stream_gen_ai_spans: + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}), patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + # Create a mock response that includes tool calls + tool_call = ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="wrong_tool", + type="function_call", + arguments='{"message": "hello"}', + ) + + tool_response = ModelResponse( + output=[tool_call], + usage=Usage( + requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + ), + response_id="resp_tool_123", + ) + + mock_get_response.side_effect = [tool_response] + + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + with pytest.raises(ModelBehaviorError): + await agents.Runner.run( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) + + (transaction,) = (item.payload for item in items if item.type == "transaction") + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + spans = [item.payload for item in items if item.type == "span"] + + ( + agent_span, + ai_client_span1, + ) = spans + assert agent_span["name"] == "invoke_agent test_agent" + assert agent_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" + + # Error due to unrecognized tool in model response. 
+ assert agent_span["status"] == "error" + else: + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}), patch( "agents.models.openai_responses.OpenAIResponsesModel.get_response" ) as mock_get_response: # Create a mock response that includes tool calls @@ -1610,44 +2511,98 @@ def simple_test_tool(message: str) -> str: sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, - send_default_pii=True, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + with pytest.raises(ModelBehaviorError): + await agents.Runner.run( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) + + (error, transaction) = events + spans = transaction["spans"] + ( + agent_span, + ai_client_span1, + ) = spans + + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + + assert agent_span["description"] == "invoke_agent test_agent" + assert agent_span["origin"] == "auto.ai.openai_agents" + + # Error due to unrecognized tool in model response. + assert agent_span["status"] == "internal_error" + assert agent_span["tags"]["status"] == "internal_error" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +@pytest.mark.asyncio +async def test_error_handling( + sentry_init, + capture_events, + capture_items, + test_agent, + stream_gen_ai_spans, +): + """ + Test error handling in agent execution. 
+ """ + + if stream_gen_ai_spans: + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}), patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + mock_get_response.side_effect = Exception("Model Error") + + sentry_init( + integrations=[ + OpenAIAgentsIntegration(), + LoggingIntegration(event_level=logging.CRITICAL), + ], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + items = capture_items("event", "span", "transaction") - with pytest.raises(ModelBehaviorError): + with pytest.raises(Exception, match="Model Error"): await agents.Runner.run( - agent_with_tool, - "Please use the simple test tool", - run_config=test_run_config, + test_agent, "Test input", run_config=test_run_config ) - (error, transaction) = events - spans = transaction["spans"] - ( - agent_span, - ai_client_span1, - ) = spans + (error_event,) = (item.payload for item in items if item.type == "event") - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + assert error_event["exception"]["values"][0]["type"] == "Exception" + assert error_event["exception"]["values"][0]["value"] == "Model Error" + assert ( + error_event["exception"]["values"][0]["mechanism"]["type"] + == "openai_agents" + ) - assert agent_span["description"] == "invoke_agent test_agent" - assert agent_span["origin"] == "auto.ai.openai_agents" + (transaction,) = (item.payload for item in items if item.type == "transaction") - # Error due to unrecognized tool in model response. 
- assert agent_span["status"] == "internal_error" - assert agent_span["tags"]["status"] == "internal_error" + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + spans = [item.payload for item in items if item.type == "span"] + (invoke_agent_span, ai_client_span) = spans -@pytest.mark.asyncio -async def test_error_handling(sentry_init, capture_events, test_agent): - """ - Test error handling in agent execution. - """ + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert ( + invoke_agent_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" + ) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["attributes"]["sentry.origin"] == "auto.ai.openai_agents" + assert ai_client_span["status"] == "error" + else: + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}), patch( "agents.models.openai_responses.OpenAIResponsesModel.get_response" ) as mock_get_response: mock_get_response.side_effect = Exception("Model Error") @@ -1658,8 +2613,8 @@ async def test_error_handling(sentry_init, capture_events, test_agent): LoggingIntegration(event_level=logging.CRITICAL), ], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() with pytest.raises(Exception, match="Model Error"): @@ -1667,32 +2622,42 @@ async def test_error_handling(sentry_init, capture_events, test_agent): test_agent, "Test input", run_config=test_run_config ) - ( - error_event, - transaction, - ) = events + ( + error_event, + transaction, + ) = events - assert error_event["exception"]["values"][0]["type"] == "Exception" - assert error_event["exception"]["values"][0]["value"] == "Model Error" - assert error_event["exception"]["values"][0]["mechanism"]["type"] == "openai_agents" + assert error_event["exception"]["values"][0]["type"] == "Exception" + assert 
error_event["exception"]["values"][0]["value"] == "Model Error" + assert ( + error_event["exception"]["values"][0]["mechanism"]["type"] + == "openai_agents" + ) - spans = transaction["spans"] - (invoke_agent_span, ai_client_span) = spans + spans = transaction["spans"] + (invoke_agent_span, ai_client_span) = spans - assert transaction["transaction"] == "test_agent workflow" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" + assert transaction["transaction"] == "test_agent workflow" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.openai_agents" - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert invoke_agent_span["origin"] == "auto.ai.openai_agents" + assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert invoke_agent_span["origin"] == "auto.ai.openai_agents" - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["origin"] == "auto.ai.openai_agents" - assert ai_client_span["status"] == "internal_error" - assert ai_client_span["tags"]["status"] == "internal_error" + assert ai_client_span["description"] == "chat gpt-4" + assert ai_client_span["origin"] == "auto.ai.openai_agents" + assert ai_client_span["status"] == "internal_error" + assert ai_client_span["tags"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_error_captures_input_data(sentry_init, capture_events, test_agent): +async def test_error_captures_input_data( + sentry_init, + capture_events, + capture_items, + test_agent, + stream_gen_ai_spans, +): """ Test that input data is captured even when the API call raises an exception. This verifies that _set_input_data is called before the API call. 
@@ -1711,53 +2676,134 @@ async def test_error_captures_input_data(sentry_init, capture_events, test_agent request=model_request, ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[ - OpenAIAgentsIntegration(), - LoggingIntegration(event_level=logging.CRITICAL), - ], - traces_sample_rate=1.0, - send_default_pii=True, + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[ + OpenAIAgentsIntegration(), + LoggingIntegration(event_level=logging.CRITICAL), + ], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("event", "span") + + with pytest.raises(InternalServerError, match="Error code: 500"): + await agents.Runner.run(agent, "Test input", run_config=test_run_config) + + (error_event,) = (item.payload for item in items if item.type == "event") + + assert error_event["exception"]["values"][0]["type"] == "InternalServerError" + assert error_event["exception"]["values"][0]["value"] == "Error code: 500" + + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ][0] + + assert ai_client_span["name"] == "chat gpt-4" + assert ai_client_span["status"] == "error" + + assert "gen_ai.request.messages" in ai_client_span["attributes"] + request_messages = safe_serialize( + [ + {"role": "user", "content": [{"type": "text", "text": "Test input"}]}, + ] + ) + assert ( + ai_client_span["attributes"]["gen_ai.request.messages"] == request_messages ) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[ + OpenAIAgentsIntegration(), + LoggingIntegration(event_level=logging.CRITICAL), + ], + traces_sample_rate=1.0, + send_default_pii=True, + 
stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + events = capture_events() - with pytest.raises(InternalServerError, match="Error code: 500"): - await agents.Runner.run(agent, "Test input", run_config=test_run_config) + with pytest.raises(InternalServerError, match="Error code: 500"): + await agents.Runner.run(agent, "Test input", run_config=test_run_config) - ( - error_event, - transaction, - ) = events + ( + error_event, + transaction, + ) = events - assert error_event["exception"]["values"][0]["type"] == "InternalServerError" - assert error_event["exception"]["values"][0]["value"] == "Error code: 500" + assert error_event["exception"]["values"][0]["type"] == "InternalServerError" + assert error_event["exception"]["values"][0]["value"] == "Error code: 500" - spans = transaction["spans"] - ai_client_span = [s for s in spans if s["op"] == "gen_ai.chat"][0] + spans = transaction["spans"] + ai_client_span = [s for s in spans if s["op"] == "gen_ai.chat"][0] - assert ai_client_span["description"] == "chat gpt-4" - assert ai_client_span["status"] == "internal_error" - assert ai_client_span["tags"]["status"] == "internal_error" + assert ai_client_span["description"] == "chat gpt-4" + assert ai_client_span["status"] == "internal_error" + assert ai_client_span["tags"]["status"] == "internal_error" - assert "gen_ai.request.messages" in ai_client_span["data"] - request_messages = safe_serialize( - [ - {"role": "user", "content": [{"type": "text", "text": "Test input"}]}, - ] - ) - assert ai_client_span["data"]["gen_ai.request.messages"] == request_messages + assert "gen_ai.request.messages" in ai_client_span["data"] + request_messages = safe_serialize( + [ + {"role": "user", "content": [{"type": "text", "text": "Test input"}]}, + ] + ) + assert ai_client_span["data"]["gen_ai.request.messages"] == request_messages +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_span_status_error(sentry_init, 
capture_events, test_agent): - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( +async def test_span_status_error( + sentry_init, + capture_events, + capture_items, + test_agent, + stream_gen_ai_spans, +): + if stream_gen_ai_spans: + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}), patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + mock_get_response.side_effect = ValueError("Model Error") + + sentry_init( + integrations=[ + OpenAIAgentsIntegration(), + LoggingIntegration(event_level=logging.CRITICAL), + ], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("event", "transaction", "span") + + with pytest.raises(ValueError, match="Model Error"): + await agents.Runner.run( + test_agent, "Test input", run_config=test_run_config + ) + + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + assert spans[0]["status"] == "error" + + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}), patch( "agents.models.openai_responses.OpenAIResponsesModel.get_response" ) as mock_get_response: mock_get_response.side_effect = ValueError("Model Error") @@ -1768,6 +2814,7 @@ async def test_span_status_error(sentry_init, capture_events, test_agent): LoggingIntegration(event_level=logging.CRITICAL), ], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) events = capture_events() @@ -1777,16 +2824,23 @@ async def test_span_status_error(sentry_init, capture_events, test_agent): test_agent, "Test input", run_config=test_run_config ) - (error, transaction) = events - assert error["level"] == "error" - assert transaction["spans"][0]["status"] == "internal_error" - assert transaction["spans"][0]["tags"]["status"] == 
"internal_error" + (error, transaction) = events + assert error["level"] == "error" + assert transaction["spans"][0]["status"] == "internal_error" + assert transaction["spans"][0]["tags"]["status"] == "internal_error" + assert transaction["contexts"]["trace"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_mcp_tool_execution_spans( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, + capture_events, + capture_items, + test_agent, + get_model_response, + stream_gen_ai_spans, ): """ Test that MCP (Model Context Protocol) tool calls create execute_tool spans. @@ -1869,52 +2923,106 @@ async def test_mcp_tool_execution_spans( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - side_effect=[mcp_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + side_effect=[mcp_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - await agents.Runner.run( - agent, - "Please use MCP tool", - run_config=test_run_config, + await agents.Runner.run( + agent, + "Please use MCP tool", + run_config=test_run_config, + ) + + spans = [item.payload for item in items if item.type == "span"] + + # Find the MCP execute_tool span + mcp_tool_span = None + for span in spans: + if span.get("name") == "execute_tool test_mcp_tool": + mcp_tool_span = span + break + + # Verify the MCP tool span was created + assert mcp_tool_span is not None, "MCP execute_tool span was not created" + assert mcp_tool_span["name"] == "execute_tool test_mcp_tool" + assert 
mcp_tool_span["attributes"]["gen_ai.tool.name"] == "test_mcp_tool" + assert ( + mcp_tool_span["attributes"]["gen_ai.tool.input"] + == '{"query": "search term"}' + ) + assert ( + mcp_tool_span["attributes"]["gen_ai.tool.output"] + == "MCP tool executed successfully" ) - (transaction,) = events - spans = transaction["spans"] + # Verify no error status since error was None + assert mcp_tool_span.get("status") != "error" + assert mcp_tool_span.get("tags", {}).get("status") != "error" + else: + with patch.object( + agent.model._client._client, + "send", + side_effect=[mcp_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - # Find the MCP execute_tool span - mcp_tool_span = None - for span in spans: - if span.get("description") == "execute_tool test_mcp_tool": - mcp_tool_span = span - break + await agents.Runner.run( + agent, + "Please use MCP tool", + run_config=test_run_config, + ) - # Verify the MCP tool span was created - assert mcp_tool_span is not None, "MCP execute_tool span was not created" - assert mcp_tool_span["description"] == "execute_tool test_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "search term"}' - assert ( - mcp_tool_span["data"]["gen_ai.tool.output"] == "MCP tool executed successfully" - ) + (transaction,) = events + spans = transaction["spans"] + + # Find the MCP execute_tool span + mcp_tool_span = None + for span in spans: + if span.get("description") == "execute_tool test_mcp_tool": + mcp_tool_span = span + break + + # Verify the MCP tool span was created + assert mcp_tool_span is not None, "MCP execute_tool span was not created" + assert mcp_tool_span["description"] == "execute_tool test_mcp_tool" + assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool" + assert 
mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "search term"}' + assert ( + mcp_tool_span["data"]["gen_ai.tool.output"] + == "MCP tool executed successfully" + ) - # Verify no error status since error was None - assert mcp_tool_span.get("status") != "internal_error" - assert mcp_tool_span.get("tags", {}).get("status") != "internal_error" + # Verify no error status since error was None + assert mcp_tool_span.get("status") != "internal_error" + assert mcp_tool_span.get("tags", {}).get("status") != "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_mcp_tool_execution_with_error( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, + capture_events, + capture_items, + test_agent, + get_model_response, + stream_gen_ai_spans, ): """ Test that MCP tool calls with errors are tracked with error status. @@ -1997,50 +3105,95 @@ async def test_mcp_tool_execution_with_error( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - side_effect=[mcp_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + side_effect=[mcp_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - await agents.Runner.run( - agent, - "Please use failing MCP tool", - run_config=test_run_config, - ) + await agents.Runner.run( + agent, + "Please use failing MCP tool", + run_config=test_run_config, + ) + + spans = [item.payload for item in items if item.type == "span"] + + # Find the MCP execute_tool span with error + mcp_tool_span = None + for span in spans: + if span.get("name") 
== "execute_tool failing_mcp_tool": + mcp_tool_span = span + break + + # Verify the MCP tool span was created with error status + assert mcp_tool_span is not None, "MCP execute_tool span was not created" + assert mcp_tool_span["name"] == "execute_tool failing_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.name"] == "failing_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.input"] == '{"query": "test"}' + + # Verify error status was set + assert mcp_tool_span["status"] == "error" + else: + with patch.object( + agent.model._client._client, + "send", + side_effect=[mcp_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + await agents.Runner.run( + agent, + "Please use failing MCP tool", + run_config=test_run_config, + ) - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - # Find the MCP execute_tool span with error - mcp_tool_span = None - for span in spans: - if span.get("description") == "execute_tool failing_mcp_tool": - mcp_tool_span = span - break + # Find the MCP execute_tool span with error + mcp_tool_span = None + for span in spans: + if span.get("description") == "execute_tool failing_mcp_tool": + mcp_tool_span = span + break - # Verify the MCP tool span was created with error status - assert mcp_tool_span is not None, "MCP execute_tool span was not created" - assert mcp_tool_span["description"] == "execute_tool failing_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.name"] == "failing_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "test"}' - assert mcp_tool_span["data"]["gen_ai.tool.output"] is None + # Verify the MCP tool span was created with error status + assert mcp_tool_span is not None, "MCP execute_tool span was not created" + assert mcp_tool_span["description"] 
== "execute_tool failing_mcp_tool" + assert mcp_tool_span["data"]["gen_ai.tool.name"] == "failing_mcp_tool" + assert mcp_tool_span["data"]["gen_ai.tool.input"] == '{"query": "test"}' + assert mcp_tool_span["data"]["gen_ai.tool.output"] is None - # Verify error status was set - assert mcp_tool_span["status"] == "internal_error" - assert mcp_tool_span["tags"]["status"] == "internal_error" + # Verify error status was set + assert mcp_tool_span["status"] == "internal_error" + assert mcp_tool_span["tags"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_mcp_tool_execution_without_pii( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, + capture_events, + capture_items, + test_agent, + get_model_response, + stream_gen_ai_spans, ): """ Test that MCP tool input/output are not included when send_default_pii is False. @@ -2123,52 +3276,94 @@ async def test_mcp_tool_execution_without_pii( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - side_effect=[mcp_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=False, # PII disabled - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + side_effect=[mcp_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, # PII disabled + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - await agents.Runner.run( - agent, - "Please use MCP tool", - run_config=test_run_config, - ) + await agents.Runner.run( + agent, + "Please use MCP tool", + run_config=test_run_config, + ) + + spans = [item.payload for item in items if item.type == "span"] + + # Find the MCP execute_tool span + mcp_tool_span = None + for span in 
spans: + if span.get("name") == "execute_tool test_mcp_tool": + mcp_tool_span = span + break + + # Verify the MCP tool span was created but without input/output + assert mcp_tool_span is not None, "MCP execute_tool span was not created" + assert mcp_tool_span["name"] == "execute_tool test_mcp_tool" + assert mcp_tool_span["attributes"]["gen_ai.tool.name"] == "test_mcp_tool" + + # Verify input and output are not included when send_default_pii is False + assert "gen_ai.tool.input" not in mcp_tool_span["attributes"] + assert "gen_ai.tool.output" not in mcp_tool_span["attributes"] + else: + with patch.object( + agent.model._client._client, + "send", + side_effect=[mcp_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, # PII disabled + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + await agents.Runner.run( + agent, + "Please use MCP tool", + run_config=test_run_config, + ) - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - # Find the MCP execute_tool span - mcp_tool_span = None - for span in spans: - if span.get("description") == "execute_tool test_mcp_tool": - mcp_tool_span = span - break + # Find the MCP execute_tool span + mcp_tool_span = None + for span in spans: + if span.get("description") == "execute_tool test_mcp_tool": + mcp_tool_span = span + break - # Verify the MCP tool span was created but without input/output - assert mcp_tool_span is not None, "MCP execute_tool span was not created" - assert mcp_tool_span["description"] == "execute_tool test_mcp_tool" - assert mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool" + # Verify the MCP tool span was created but without input/output + assert mcp_tool_span is not None, "MCP execute_tool span was not created" + assert mcp_tool_span["description"] == "execute_tool test_mcp_tool" + assert 
mcp_tool_span["data"]["gen_ai.tool.name"] == "test_mcp_tool" - # Verify input and output are not included when send_default_pii is False - assert "gen_ai.tool.input" not in mcp_tool_span["data"] - assert "gen_ai.tool.output" not in mcp_tool_span["data"] + # Verify input and output are not included when send_default_pii is False + assert "gen_ai.tool.input" not in mcp_tool_span["data"] + assert "gen_ai.tool.output" not in mcp_tool_span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_multiple_agents_asyncio( sentry_init, capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, + stream_gen_ai_spans, ): """ Test that multiple agents can be run at the same time in asyncio tasks @@ -2182,36 +3377,67 @@ async def test_multiple_agents_asyncio( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - async def run(): - await agents.Runner.run( - starting_agent=agent, - input="Test input", - run_config=test_run_config, + async def run(): + await agents.Runner.run( + starting_agent=agent, + input="Test input", + run_config=test_run_config, + ) + + await asyncio.gather(*[run() for _ in range(3)]) + + txn1, txn2, txn3 = ( + item.payload for item in items if item.type == "transaction" + ) + + assert txn1["transaction"] == "test_agent workflow" + assert txn2["transaction"] == "test_agent workflow" + assert txn3["transaction"] == "test_agent workflow" + else: 
+ with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) + events = capture_events() + + async def run(): + await agents.Runner.run( + starting_agent=agent, + input="Test input", + run_config=test_run_config, + ) - await asyncio.gather(*[run() for _ in range(3)]) + await asyncio.gather(*[run() for _ in range(3)]) - assert len(events) == 3 - txn1, txn2, txn3 = events + assert len(events) == 3 + txn1, txn2, txn3 = events - assert txn1["type"] == "transaction" - assert txn1["transaction"] == "test_agent workflow" - assert txn2["type"] == "transaction" - assert txn2["transaction"] == "test_agent workflow" - assert txn3["type"] == "transaction" - assert txn3["transaction"] == "test_agent workflow" + assert txn1["type"] == "transaction" + assert txn1["transaction"] == "test_agent workflow" + assert txn2["type"] == "transaction" + assert txn2["transaction"] == "test_agent workflow" + assert txn3["type"] == "transaction" + assert txn3["transaction"] == "test_agent workflow" # Test input messages with mixed roles including "ai" @@ -2230,7 +3456,7 @@ async def run(): ], ) def test_openai_agents_message_role_mapping( - sentry_init, capture_events, test_message, expected_role + sentry_init, capture_items, test_message, expected_role ): """Test that OpenAI Agents integration properly maps message roles like 'ai' to 'assistant'""" sentry_init( @@ -2256,13 +3482,16 @@ def test_openai_agents_message_role_mapping( assert stored_messages[0]["role"] == expected_role +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_tool_execution_error_tracing( sentry_init, capture_events, + capture_items, test_agent, get_model_response, responses_tool_call_model_responses, + stream_gen_ai_spans, ): """ Test that tool execution errors are properly tracked via error tracing patch. 
@@ -2327,57 +3556,103 @@ def failing_tool(message: str) -> str: serialize_pydantic=True, ) - with patch.object( - agent_with_tool.model._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + items = capture_items("span", "transaction") - events = capture_events() + # Note: The agents library catches tool exceptions internally, + # so we don't expect this to raise + await agents.Runner.run( + agent_with_tool, + "Please use the failing tool", + run_config=test_run_config, + ) - # Note: The agents library catches tool exceptions internally, - # so we don't expect this to raise - await agents.Runner.run( - agent_with_tool, - "Please use the failing tool", - run_config=test_run_config, - ) + spans = [item.payload for item in items if item.type == "span"] + + # Find the execute_tool span + execute_tool_span = None + for span in spans: + description = span.get("name", "") + if description is not None and description.startswith( + "execute_tool failing_tool" + ): + execute_tool_span = span + break + + # Verify the execute_tool span was created + assert execute_tool_span is not None, "execute_tool span was not created" + assert execute_tool_span["name"] == "execute_tool failing_tool" + assert execute_tool_span["attributes"]["gen_ai.tool.name"] == "failing_tool" + + # Verify error status was set (this is the key test for our patch) + # The span should be marked as error because the tool execution failed + assert execute_tool_span["status"] == "error" + else: + with patch.object( + agent_with_tool.model._client._client, + 
"send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + # Note: The agents library catches tool exceptions internally, + # so we don't expect this to raise + await agents.Runner.run( + agent_with_tool, + "Please use the failing tool", + run_config=test_run_config, + ) - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - # Find the execute_tool span - execute_tool_span = None - for span in spans: - description = span.get("description", "") - if description is not None and description.startswith( - "execute_tool failing_tool" - ): - execute_tool_span = span - break + # Find the execute_tool span + execute_tool_span = None + for span in spans: + description = span.get("description", "") + if description is not None and description.startswith( + "execute_tool failing_tool" + ): + execute_tool_span = span + break - # Verify the execute_tool span was created - assert execute_tool_span is not None, "execute_tool span was not created" - assert execute_tool_span["description"] == "execute_tool failing_tool" - assert execute_tool_span["data"]["gen_ai.tool.name"] == "failing_tool" + # Verify the execute_tool span was created + assert execute_tool_span is not None, "execute_tool span was not created" + assert execute_tool_span["description"] == "execute_tool failing_tool" + assert execute_tool_span["data"]["gen_ai.tool.name"] == "failing_tool" - # Verify error status was set (this is the key test for our patch) - # The span should be marked as error because the tool execution failed - assert execute_tool_span["status"] == "internal_error" - assert execute_tool_span["tags"]["status"] == "internal_error" + # Verify error status was set (this is the key test for our patch) + # The span should be marked as error because the 
tool execution failed + assert execute_tool_span["status"] == "internal_error" + assert execute_tool_span["tags"]["status"] == "internal_error" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_invoke_agent_span_includes_usage_data( sentry_init, capture_events, + capture_items, test_agent, get_model_response, + stream_gen_ai_spans, ): """ Test that invoke_agent spans include aggregated usage data from context_wrapper. @@ -2426,50 +3701,94 @@ async def test_invoke_agent_span_includes_usage_data( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + items = capture_items("span", "transaction") + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT ) - events = capture_events() + # Verify invoke_agent span has usage data from context_wrapper + assert invoke_agent_span["name"] == "invoke_agent test_agent" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["attributes"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["attributes"] + + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert 
invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens.cached"] == 0 + assert ( + invoke_agent_span["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 5 + ) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config - ) + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) - assert result is not None + assert result is not None - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) - # Verify invoke_agent span has usage data from context_wrapper - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + # Verify invoke_agent span has usage data from context_wrapper + assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 - assert 
invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 0 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 5 + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 0 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 5 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_ai_client_span_includes_response_model( sentry_init, capture_events, + capture_items, test_agent, get_model_response, + stream_gen_ai_spans, ): """ Test that ai_client spans (gen_ai.chat) include the response model from the actual API response. @@ -2518,40 +3837,76 @@ async def test_ai_client_span_includes_response_model( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + items = capture_items("span", "transaction") + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None - events = capture_events() + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config + # Verify ai_client span has response model from API response + assert 
ai_client_span["name"] == "chat gpt-4" + assert "gen_ai.response.model" in ai_client_span["attributes"] + assert ( + ai_client_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" ) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - assert result is not None + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None - (transaction,) = events - spans = transaction["spans"] - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + (transaction,) = events + spans = transaction["spans"] + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) - # Verify ai_client span has response model from API response - assert ai_client_span["description"] == "chat gpt-4" - assert "gen_ai.response.model" in ai_client_span["data"] - assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + # Verify ai_client span has response model from API response + assert ai_client_span["description"] == "chat gpt-4" + assert "gen_ai.response.model" in ai_client_span["data"] + assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_ai_client_span_response_model_with_chat_completions( sentry_init, capture_events, + capture_items, get_model_response, + stream_gen_ai_spans, ): """ Test that response model is captured when using ChatCompletions API (not Responses API). 
@@ -2606,36 +3961,76 @@ async def test_ai_client_span_response_model_with_chat_completions( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None - events = capture_events() + spans = [item.payload for item in items if item.type == "span"] + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT + ) - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config + # Verify response model from API response is captured + assert "gen_ai.response.model" in ai_client_span["attributes"] + assert ( + ai_client_span["attributes"]["gen_ai.response.model"] + == "gpt-4o-mini-2024-07-18" ) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) - assert result is not None + assert result is not None - (transaction,) = events - spans = transaction["spans"] - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + (transaction,) = events + spans = transaction["spans"] + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) - # Verify response model from API response is captured - assert 
"gen_ai.response.model" in ai_client_span["data"] - assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4o-mini-2024-07-18" + # Verify response model from API response is captured + assert "gen_ai.response.model" in ai_client_span["data"] + assert ( + ai_client_span["data"]["gen_ai.response.model"] == "gpt-4o-mini-2024-07-18" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_multiple_llm_calls_aggregate_usage( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, + capture_events, + capture_items, + test_agent, + get_model_response, + stream_gen_ai_spans, ): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls @@ -2723,48 +4118,91 @@ def calculator(a: int, b: int) -> int: serialize_pydantic=True, ) - with patch.object( - agent_with_tool.model._client._client, - "send", - side_effect=[tool_call_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_call_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - result = await agents.Runner.run( - agent_with_tool, - "What is 5 + 3?", - run_config=test_run_config, + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) + + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + + invoke_agent_span = spans[0] + + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total + assert 
invoke_agent_span["attributes"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["attributes"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["attributes"]["gen_ai.usage.total_tokens"] == 50 + # Cached tokens should be aggregated: 0 + 5 = 5 + assert invoke_agent_span["attributes"]["gen_ai.usage.input_tokens.cached"] == 5 + # Reasoning tokens should be aggregated: 0 + 3 = 3 + assert ( + invoke_agent_span["attributes"]["gen_ai.usage.output_tokens.reasoning"] == 3 ) + else: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_call_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) + + assert result is not None - assert result is not None + (transaction,) = events + spans = transaction["spans"] - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = spans[0] + invoke_agent_span = spans[0] - # Verify invoke_agent span has aggregated usage from both API calls - # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 - assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 - # Cached tokens should be aggregated: 0 + 5 = 5 - assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 5 - # Reasoning tokens should be aggregated: 0 + 3 = 3 - assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 3 + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total + assert 
invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + # Cached tokens should be aggregated: 0 + 5 = 5 + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 5 + # Reasoning tokens should be aggregated: 0 + 3 = 3 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 3 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_invoke_agent_span_includes_response_model( sentry_init, capture_events, + capture_items, test_agent, get_model_response, + stream_gen_ai_spans, ): """ Test that invoke_agent spans include the response model from the API response. @@ -2812,48 +4250,99 @@ async def test_invoke_agent_span_includes_response_model( serialize_pydantic=True, ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - events = capture_events() + # Verify invoke_agent span has response model from API + assert invoke_agent_span["name"] == "invoke_agent 
test_agent" + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config + # Also verify ai_client span has it + assert "gen_ai.response.model" in ai_client_span["attributes"] + assert ( + ai_client_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" ) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - assert result is not None + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + assert result is not None + + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) - # Verify invoke_agent span has response model from API - assert invoke_agent_span["description"] == "invoke_agent test_agent" - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + # Verify invoke_agent span has response model from API + assert invoke_agent_span["description"] == "invoke_agent test_agent" + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert ( + invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) - # Also verify ai_client span has it - assert "gen_ai.response.model" in 
ai_client_span["data"] - assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + # Also verify ai_client span has it + assert "gen_ai.response.model" in ai_client_span["data"] + assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_invoke_agent_span_uses_last_response_model( sentry_init, capture_events, + capture_items, test_agent, get_model_response, + stream_gen_ai_spans, ): """ Test that when an agent makes multiple LLM calls (e.g., with tools), @@ -2941,45 +4430,94 @@ def calculator(a: int, b: int) -> int: serialize_pydantic=True, ) - with patch.object( - agent_with_tool.model._client._client, - "send", - side_effect=[first_response, second_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + if stream_gen_ai_spans: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - result = await agents.Runner.run( - agent_with_tool, - "What is 5 + 3?", - run_config=test_run_config, + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) + + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + + invoke_agent_span = spans[0] + first_ai_client_span = spans[1] + second_ai_client_span = spans[3] # After tool span + + # Invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["attributes"] + assert ( + invoke_agent_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) + + # Each 
ai_client span has its own response model from the API + assert ( + first_ai_client_span["attributes"]["gen_ai.response.model"] == "gpt-4-0613" + ) + assert ( + second_ai_client_span["attributes"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" ) + else: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) - assert result is not None + assert result is not None - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = spans[0] - first_ai_client_span = spans[1] - second_ai_client_span = spans[3] # After tool span + (transaction,) = events + spans = transaction["spans"] - # Invoke_agent span uses the LAST response model - assert "gen_ai.response.model" in invoke_agent_span["data"] - assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + invoke_agent_span = spans[0] + first_ai_client_span = spans[1] + second_ai_client_span = spans[3] # After tool span - # Each ai_client span has its own response model from the API - assert first_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4-0613" - assert ( - second_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" - ) + # Invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert ( + invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + ) + + # Each ai_client span has its own response model from the API + assert first_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4-0613" + assert ( + second_ai_client_span["data"]["gen_ai.response.model"] + == "gpt-4.1-2025-04-14" + ) -def 
test_openai_agents_message_truncation(sentry_init, capture_events): +def test_openai_agents_message_truncation(sentry_init, capture_items): """Test that large messages are truncated properly in OpenAI Agents integration.""" large_content = ( @@ -3199,30 +4737,28 @@ async def test_streaming_ttft_on_chat_span( agent_with_tool.model._client._client, "send", return_value=response, - ) as _: - with sentry_sdk.start_transaction( - name="test_ttft", sampled=True - ) as transaction: - result = agents.Runner.run_streamed( - agent_with_tool, - "Please use the simple test tool", - run_config=test_run_config, - ) + ) as _, sentry_sdk.start_transaction(name="test_ttft", sampled=True) as transaction: + result = agents.Runner.run_streamed( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) - async for event in result.stream_events(): - pass + async for event in result.stream_events(): + pass - # Verify TTFT is recorded on the chat span (must be inside transaction context) - chat_spans = [ - s for s in transaction._span_recorder.spans if s.op == "gen_ai.chat" - ] - assert len(chat_spans) >= 1 - chat_span = chat_spans[0] + # Verify TTFT is recorded on the chat span (must be inside transaction context) + chat_spans = [ + s for s in transaction._span_recorder.spans if s.op == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + chat_span = chat_spans[0] - assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in chat_span._data - assert chat_span._data.get(SPANDATA.GEN_AI_RESPONSE_STREAMING) is True + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in chat_span._data + assert chat_span._data.get(SPANDATA.GEN_AI_RESPONSE_STREAMING) is True +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( parse_version(OPENAI_AGENTS_VERSION) < (0, 4, 0), reason="conversation_id support requires openai-agents >= 0.4.0", @@ -3231,9 +4767,11 @@ async def test_streaming_ttft_on_chat_span( async def test_conversation_id_on_all_spans( 
sentry_init, capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, + stream_gen_ai_spans, ): """ Test that gen_ai.conversation.id is set on all AI-related spans when passed to Runner.run(). @@ -3247,54 +4785,109 @@ async def test_conversation_id_on_all_spans( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + result = await agents.Runner.run( + agent, + "Test input", + run_config=test_run_config, + conversation_id="conv_test_123", + ) + + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - events = capture_events() + # Verify workflow span (transaction) has conversation_id + (transaction,) = (item.payload for item in items if item.type == "transaction") - result = await agents.Runner.run( - agent, - "Test input", - run_config=test_run_config, - conversation_id="conv_test_123", + assert ( + transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] + == "conv_test_123" + ) + + # Verify invoke_agent span has conversation_id + assert ( + invoke_agent_span["attributes"]["gen_ai.conversation.id"] == "conv_test_123" ) - assert result is not None + # Verify ai_client span has conversation_id + assert ai_client_span["attributes"]["gen_ai.conversation.id"] 
== "conv_test_123" + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() - (transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + result = await agents.Runner.run( + agent, + "Test input", + run_config=test_run_config, + conversation_id="conv_test_123", + ) - # Verify workflow span (transaction) has conversation_id - assert ( - transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] - == "conv_test_123" - ) + assert result is not None + + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + + # Verify workflow span (transaction) has conversation_id + assert ( + transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] + == "conv_test_123" + ) - # Verify invoke_agent span has conversation_id - assert invoke_agent_span["data"]["gen_ai.conversation.id"] == "conv_test_123" + # Verify invoke_agent span has conversation_id + assert invoke_agent_span["data"]["gen_ai.conversation.id"] == "conv_test_123" - # Verify ai_client span has conversation_id - assert ai_client_span["data"]["gen_ai.conversation.id"] == "conv_test_123" + # Verify ai_client span has conversation_id + assert ai_client_span["data"]["gen_ai.conversation.id"] == "conv_test_123" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( parse_version(OPENAI_AGENTS_VERSION) < (0, 4, 0), reason="conversation_id support requires openai-agents >= 0.4.0", ) @pytest.mark.asyncio async def 
test_conversation_id_on_tool_span( - sentry_init, capture_events, test_agent, get_model_response + sentry_init, + capture_events, + capture_items, + test_agent, + get_model_response, + stream_gen_ai_spans, ): """ Test that gen_ai.conversation.id is set on tool execution spans when passed to Runner.run(). @@ -3381,46 +4974,91 @@ def simple_tool(message: str) -> str: serialize_pydantic=True, ) - with patch.object( - agent_with_tool.model._client._client, - "send", - side_effect=[tool_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + items = capture_items("span", "transaction") - await agents.Runner.run( - agent_with_tool, - "Use the tool", - run_config=test_run_config, - conversation_id="conv_tool_test_456", + await agents.Runner.run( + agent_with_tool, + "Use the tool", + run_config=test_run_config, + conversation_id="conv_tool_test_456", + ) + + spans = [item.payload for item in items if item.type == "span"] + + # Find the tool span + tool_span = None + for span in spans: + if span.get("name", "").startswith("execute_tool"): + tool_span = span + break + + assert tool_span is not None + # Tool span should have the conversation_id passed to Runner.run() + assert tool_span["attributes"]["gen_ai.conversation.id"] == "conv_tool_test_456" + + # Workflow span (transaction) should have the same conversation_id + (transaction,) = (item.payload for item in items if item.type == "transaction") + + # Workflow span (transaction) should have the same conversation_id + assert ( + transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] + == "conv_tool_test_456" ) + else: + with 
patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + events = capture_events() + + await agents.Runner.run( + agent_with_tool, + "Use the tool", + run_config=test_run_config, + conversation_id="conv_tool_test_456", + ) - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - # Find the tool span - tool_span = None - for span in spans: - if span.get("description", "").startswith("execute_tool"): - tool_span = span - break + # Find the tool span + tool_span = None + for span in spans: + if span.get("description", "").startswith("execute_tool"): + tool_span = span + break - assert tool_span is not None - # Tool span should have the conversation_id passed to Runner.run() - assert tool_span["data"]["gen_ai.conversation.id"] == "conv_tool_test_456" + assert tool_span is not None + # Tool span should have the conversation_id passed to Runner.run() + assert tool_span["data"]["gen_ai.conversation.id"] == "conv_tool_test_456" - # Workflow span (transaction) should have the same conversation_id - assert ( - transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] - == "conv_tool_test_456" - ) + # Workflow span (transaction) should have the same conversation_id + assert ( + transaction["contexts"]["trace"]["data"]["gen_ai.conversation.id"] + == "conv_tool_test_456" + ) +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.skipif( parse_version(OPENAI_AGENTS_VERSION) < (0, 4, 0), reason="conversation_id support requires openai-agents >= 0.4.0", @@ -3429,9 +5067,11 @@ def simple_tool(message: str) -> str: async def test_no_conversation_id_when_not_provided( sentry_init, capture_events, + capture_items, test_agent, nonstreaming_responses_model_response, get_model_response, + 
stream_gen_ai_spans, ): """ Test that gen_ai.conversation.id is not set when not passed to Runner.run(). @@ -3445,35 +5085,76 @@ async def test_no_conversation_id_when_not_provided( nonstreaming_responses_model_response, serialize_pydantic=True ) - with patch.object( - agent.model._client._client, - "send", - return_value=response, - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + if stream_gen_ai_spans: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + + items = capture_items("span", "transaction") + + # Don't pass conversation_id + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None - events = capture_events() + (transaction,) = (item.payload for item in items if item.type == "transaction") - # Don't pass conversation_id - result = await agents.Runner.run( - agent, "Test input", run_config=test_run_config + spans = [item.payload for item in items if item.type == "span"] + invoke_agent_span = next( + span + for span in spans + if span["attributes"]["sentry.op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next( + span for span in spans if span["attributes"]["sentry.op"] == OP.GEN_AI_CHAT ) - assert result is not None + # Verify conversation_id is NOT set on any spans + assert "gen_ai.conversation.id" not in transaction["contexts"]["trace"].get( + "attributes", {} + ) + assert "gen_ai.conversation.id" not in invoke_agent_span.get("attributes", {}) + assert "gen_ai.conversation.id" not in ai_client_span.get("attributes", {}) + else: + with patch.object( + agent.model._client._client, + "send", + return_value=response, + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - 
(transaction,) = events - spans = transaction["spans"] - invoke_agent_span = next( - span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT - ) - ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + events = capture_events() - # Verify conversation_id is NOT set on any spans - assert "gen_ai.conversation.id" not in transaction["contexts"]["trace"].get( - "data", {} - ) - assert "gen_ai.conversation.id" not in invoke_agent_span.get("data", {}) - assert "gen_ai.conversation.id" not in ai_client_span.get("data", {}) + # Don't pass conversation_id + result = await agents.Runner.run( + agent, "Test input", run_config=test_run_config + ) + + assert result is not None + + (transaction,) = events + spans = transaction["spans"] + invoke_agent_span = next( + span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT + ) + ai_client_span = next(span for span in spans if span["op"] == OP.GEN_AI_CHAT) + + # Verify conversation_id is NOT set on any spans + assert "gen_ai.conversation.id" not in transaction["contexts"]["trace"].get( + "data", {} + ) + assert "gen_ai.conversation.id" not in invoke_agent_span.get("data", {}) + assert "gen_ai.conversation.id" not in ai_client_span.get("data", {}) diff --git a/tests/integrations/pydantic_ai/test_pydantic_ai.py b/tests/integrations/pydantic_ai/test_pydantic_ai.py index 50ce155f5b..23850c3233 100644 --- a/tests/integrations/pydantic_ai/test_pydantic_ai.py +++ b/tests/integrations/pydantic_ai/test_pydantic_ai.py @@ -52,8 +52,15 @@ def inner(): return inner +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_run_async(sentry_init, capture_events, get_test_agent): +async def test_agent_run_async( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that the integration creates spans for async agent runs. 
""" @@ -61,49 +68,89 @@ async def test_agent_run_async(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - result = await test_agent.run("Test input") - assert result is not None - assert result.output is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (transaction,) = events - spans = transaction["spans"] + result = await test_agent.run("Test input") - # Verify transaction (the transaction IS the invoke_agent span) - assert transaction["transaction"] == "invoke_agent test_agent" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + assert result is not None + assert result.output is not None - # The transaction itself should have invoke_agent data - assert transaction["contexts"]["trace"]["op"] == "gen_ai.invoke_agent" + (transaction,) = (item.payload for item in items if item.type == "transaction") - # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + # Verify transaction (the transaction IS the invoke_agent span) + assert transaction["transaction"] == "invoke_agent test_agent" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" - # Check chat span - chat_span = chat_spans[0] - assert "chat" in chat_span["description"] - assert chat_span["data"]["gen_ai.operation.name"] == "chat" - assert chat_span["data"]["gen_ai.response.streaming"] is False - assert "gen_ai.request.messages" in chat_span["data"] - assert "gen_ai.usage.input_tokens" in chat_span["data"] - assert "gen_ai.usage.output_tokens" in chat_span["data"] + # The transaction itself should have invoke_agent data + assert transaction["contexts"]["trace"]["op"] == "gen_ai.invoke_agent" + + spans = [item.payload for item in 
items if item.type == "span"] + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + + # Check chat span + chat_span = chat_spans[0] + assert "chat" in chat_span["name"] + assert chat_span["attributes"]["gen_ai.operation.name"] == "chat" + assert chat_span["attributes"]["gen_ai.response.streaming"] is False + assert "gen_ai.request.messages" in chat_span["attributes"] + assert "gen_ai.usage.input_tokens" in chat_span["attributes"] + assert "gen_ai.usage.output_tokens" in chat_span["attributes"] + else: + events = capture_events() + + result = await test_agent.run("Test input") + + assert result is not None + assert result.output is not None + + (transaction,) = events + spans = transaction["spans"] + + # Verify transaction (the transaction IS the invoke_agent span) + assert transaction["transaction"] == "invoke_agent test_agent" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + + # The transaction itself should have invoke_agent data + assert transaction["contexts"]["trace"]["op"] == "gen_ai.invoke_agent" + + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + # Check chat span + chat_span = chat_spans[0] + assert "chat" in chat_span["description"] + assert chat_span["data"]["gen_ai.operation.name"] == "chat" + assert chat_span["data"]["gen_ai.response.streaming"] is False + assert "gen_ai.request.messages" in chat_span["data"] + assert "gen_ai.usage.input_tokens" in chat_span["data"] + assert "gen_ai.usage.output_tokens" in chat_span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_run_async_model_error(sentry_init, capture_events): +async def test_agent_run_async_model_error( + sentry_init, + 
capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - def failing_model(messages, info): raise RuntimeError("model exploded") @@ -112,20 +159,43 @@ def failing_model(messages, info): name="test_agent", ) - with pytest.raises(RuntimeError, match="model exploded"): - await agent.run("Test input") + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") + + with pytest.raises(RuntimeError, match="model exploded"): + await agent.run("Test input") + + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + + assert spans[0]["status"] == "error" + else: + events = capture_events() - (error, transaction) = events - assert error["level"] == "error" + with pytest.raises(RuntimeError, match="model exploded"): + await agent.run("Test input") - spans = transaction["spans"] - assert len(spans) == 1 + (error, transaction) = events + assert error["level"] == "error" - assert spans[0]["status"] == "internal_error" + spans = transaction["spans"] + assert len(spans) == 1 + assert spans[0]["status"] == "internal_error" + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_agent): +async def test_agent_run_async_usage_data( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that the invoke_agent span includes token usage and model data. 
""" @@ -133,17 +203,29 @@ async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_ integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - result = await test_agent.run("Test input") - assert result is not None - assert result.output is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (transaction,) = events + result = await test_agent.run("Test input") + + assert result is not None + assert result.output is not None + + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + result = await test_agent.run("Test input") + + assert result is not None + assert result.output is not None + + (transaction,) = events # Verify transaction (the transaction IS the invoke_agent span) assert transaction["transaction"] == "invoke_agent test_agent" @@ -170,7 +252,14 @@ async def test_agent_run_async_usage_data(sentry_init, capture_events, get_test_ assert trace_data["gen_ai.response.model"] == "test" # Test model name -def test_agent_run_sync(sentry_init, capture_events, get_test_agent): +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_agent_run_sync( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that the integration creates spans for sync agent runs. 
""" @@ -178,40 +267,73 @@ def test_agent_run_sync(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - result = test_agent.run_sync("Test input") - assert result is not None - assert result.output is not None + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (transaction,) = events - spans = transaction["spans"] + result = test_agent.run_sync("Test input") - # Verify transaction - assert transaction["transaction"] == "invoke_agent test_agent" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + assert result is not None + assert result.output is not None - # Find span types - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + # Verify transaction + (transaction,) = (item.payload for item in items if item.type == "transaction") - # Verify streaming flag is False for sync - for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is False + # Verify transaction + assert transaction["transaction"] == "invoke_agent test_agent" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + + # Find span types + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + + # Verify streaming flag is False for sync + for chat_span in chat_spans: + assert chat_span["attributes"]["gen_ai.response.streaming"] is False + else: + events = capture_events() + + result = test_agent.run_sync("Test input") + assert result is not None + assert result.output is not None -def test_agent_run_sync_model_error(sentry_init, capture_events): + (transaction,) = events + spans = transaction["spans"] + + # Verify transaction + assert 
transaction["transaction"] == "invoke_agent test_agent" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + + # Find span types + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + # Verify streaming flag is False for sync + for chat_span in chat_spans: + assert chat_span["data"]["gen_ai.response.streaming"] is False + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_agent_run_sync_model_error( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - def failing_model(messages, info): raise RuntimeError("model exploded") @@ -220,20 +342,43 @@ def failing_model(messages, info): name="test_agent", ) - with pytest.raises(RuntimeError, match="model exploded"): - agent.run_sync("Test input") + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") + + with pytest.raises(RuntimeError, match="model exploded"): + agent.run_sync("Test input") - (error, transaction) = events - assert error["level"] == "error" + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" - spans = transaction["spans"] - assert len(spans) == 1 + spans = [item.payload for item in items if item.type == "span"] + assert len(spans) == 1 + + assert spans[0]["status"] == "error" + else: + events = capture_events() + + with pytest.raises(RuntimeError, match="model exploded"): + agent.run_sync("Test input") + + (error, transaction) = events + assert error["level"] == "error" - assert spans[0]["status"] == "internal_error" + spans = transaction["spans"] + assert len(spans) == 1 + assert spans[0]["status"] == "internal_error" + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_run_stream(sentry_init, capture_events, 
get_test_agent): +async def test_agent_run_stream( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that the integration creates spans for streaming agent runs. """ @@ -241,41 +386,83 @@ async def test_agent_run_stream(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - async with test_agent.run_stream("Test input") as result: - # Consume the stream - async for _ in result.stream_output(): - pass - (transaction,) = events - spans = transaction["spans"] + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Verify transaction - assert transaction["transaction"] == "invoke_agent test_agent" - assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + async with test_agent.run_stream("Test input") as result: + # Consume the stream + async for _ in result.stream_output(): + pass - # Find chat spans - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + # Verify transaction + (transaction,) = (item.payload for item in items if item.type == "transaction") - # Verify streaming flag is True for streaming - for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is True - assert "gen_ai.request.messages" in chat_span["data"] - assert "gen_ai.usage.input_tokens" in chat_span["data"] - # Streaming responses should still have output data - assert ( - "gen_ai.response.text" in chat_span["data"] - or "gen_ai.response.model" in chat_span["data"] - ) + # Verify transaction + assert transaction["transaction"] == "invoke_agent test_agent" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + + # Find chat spans + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if 
s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + + # Verify streaming flag is True for streaming + for chat_span in chat_spans: + assert chat_span["attributes"]["gen_ai.response.streaming"] is True + assert "gen_ai.request.messages" in chat_span["attributes"] + assert "gen_ai.usage.input_tokens" in chat_span["attributes"] + # Streaming responses should still have output data + assert ( + "gen_ai.response.text" in chat_span["attributes"] + or "gen_ai.response.model" in chat_span["attributes"] + ) + else: + events = capture_events() + async with test_agent.run_stream("Test input") as result: + # Consume the stream + async for _ in result.stream_output(): + pass + (transaction,) = events + spans = transaction["spans"] + + # Verify transaction + assert transaction["transaction"] == "invoke_agent test_agent" + assert transaction["contexts"]["trace"]["origin"] == "auto.ai.pydantic_ai" + + # Find chat spans + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + # Verify streaming flag is True for streaming + for chat_span in chat_spans: + assert chat_span["data"]["gen_ai.response.streaming"] is True + assert "gen_ai.request.messages" in chat_span["data"] + assert "gen_ai.usage.input_tokens" in chat_span["data"] + # Streaming responses should still have output data + assert ( + "gen_ai.response.text" in chat_span["data"] + or "gen_ai.response.model" in chat_span["data"] + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_run_stream_events(sentry_init, capture_events, get_test_agent): +async def test_agent_run_stream_events( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that run_stream_events creates spans (it uses run internally, so non-streaming). 
""" @@ -283,32 +470,64 @@ async def test_agent_run_stream_events(sentry_init, capture_events, get_test_age integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - # Consume all events test_agent = get_test_agent() - async for _ in test_agent.run_stream_events("Test input"): - pass - (transaction,) = events + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Verify transaction - assert transaction["transaction"] == "invoke_agent test_agent" + async for _ in test_agent.run_stream_events("Test input"): + pass - # Find chat spans - spans = transaction["spans"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + # Verify transaction + (transaction,) = (item.payload for item in items if item.type == "transaction") - # run_stream_events uses run() internally, so streaming should be False - for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is False + # Verify transaction + assert transaction["transaction"] == "invoke_agent test_agent" + + # Find chat spans + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + + # run_stream_events uses run() internally, so streaming should be False + for chat_span in chat_spans: + assert chat_span["attributes"]["gen_ai.response.streaming"] is False + else: + events = capture_events() + + async for _ in test_agent.run_stream_events("Test input"): + pass + + (transaction,) = events + + # Verify transaction + assert transaction["transaction"] == "invoke_agent test_agent" + + # Find chat spans + spans = transaction["spans"] + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + # run_stream_events uses run() internally, so streaming should be False + for 
chat_span in chat_spans: + assert chat_span["data"]["gen_ai.response.streaming"] is False + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_with_tools(sentry_init, capture_events, get_test_agent): +async def test_agent_with_tools( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that tool execution creates execute_tool spans. """ @@ -316,6 +535,7 @@ async def test_agent_with_tools(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -325,45 +545,90 @@ def add_numbers(a: int, b: int) -> int: """Add two numbers together.""" return a + b - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - result = await test_agent.run("What is 5 + 3?") + result = await test_agent.run("What is 5 + 3?") - assert result is not None + assert result is not None - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] - # Should have tool spans - assert len(tool_spans) >= 1 + # Should have tool spans + assert len(tool_spans) >= 1 + + # Check tool span + tool_span = tool_spans[0] + assert "execute_tool" in tool_span["name"] + assert tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" + assert 
tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in tool_span["attributes"] + assert "gen_ai.tool.output" in tool_span["attributes"] + + # Check chat spans have available_tools + for chat_span in chat_spans: + assert "gen_ai.request.available_tools" in chat_span["attributes"] + available_tools_str = chat_span["attributes"][ + "gen_ai.request.available_tools" + ] + # Available tools is serialized as a string + assert "add_numbers" in available_tools_str + else: + events = capture_events() - # Check tool span - tool_span = tool_spans[0] - assert "execute_tool" in tool_span["description"] - assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in tool_span["data"] - assert "gen_ai.tool.output" in tool_span["data"] + result = await test_agent.run("What is 5 + 3?") - # Check chat spans have available_tools - for chat_span in chat_spans: - assert "gen_ai.request.available_tools" in chat_span["data"] - available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] - # Available tools is serialized as a string - assert "add_numbers" in available_tools_str + assert result is not None + (transaction,) = events + spans = transaction["spans"] + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + + # Should have tool spans + assert len(tool_spans) >= 1 + + # Check tool span + tool_span = tool_spans[0] + assert "execute_tool" in tool_span["description"] + assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" + assert tool_span["data"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in tool_span["data"] + assert "gen_ai.tool.output" in tool_span["data"] + + # Check chat spans have available_tools + for chat_span in chat_spans: + assert 
"gen_ai.request.available_tools" in chat_span["data"] + available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] + # Available tools is serialized as a string + assert "add_numbers" in available_tools_str + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "handled_tool_call_exceptions", [False, True], ) @pytest.mark.asyncio async def test_agent_with_tool_model_retry( - sentry_init, capture_events, get_test_agent, handled_tool_call_exceptions + sentry_init, + capture_events, + capture_items, + get_test_agent, + handled_tool_call_exceptions, + stream_gen_ai_spans, ): """ Test that a handled exception is captured when a tool raises ModelRetry. @@ -376,6 +641,7 @@ async def test_agent_with_tool_model_retry( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) retries = 0 @@ -391,58 +657,117 @@ def add_numbers(a: int, b: int) -> float: raise ModelRetry(message="Try again with the same arguments.") return a + b - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") - result = await test_agent.run("What is 5 + 3?") + result = await test_agent.run("What is 5 + 3?") - assert result is not None + assert result is not None - if handled_tool_call_exceptions: - (error, transaction) = events + if handled_tool_call_exceptions: + (error,) = (item.payload for item in items if item.type == "event") + assert error["level"] == "error" + assert error["exception"]["values"][0]["mechanism"]["handled"] + + spans = [item.payload for item in items if item.type == "span"] + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] + + # Should have tool spans + assert len(tool_spans) >= 1 + + # Check tool spans + 
model_retry_tool_span = tool_spans[0] + assert "execute_tool" in model_retry_tool_span["name"] + assert ( + model_retry_tool_span["attributes"]["gen_ai.operation.name"] + == "execute_tool" + ) + assert model_retry_tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in model_retry_tool_span["attributes"] + + tool_span = tool_spans[1] + assert "execute_tool" in tool_span["name"] + assert tool_span["attributes"]["gen_ai.operation.name"] == "execute_tool" + assert tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in tool_span["attributes"] + assert "gen_ai.tool.output" in tool_span["attributes"] + + # Check chat spans have available_tools + for chat_span in chat_spans: + assert "gen_ai.request.available_tools" in chat_span["attributes"] + available_tools_str = chat_span["attributes"][ + "gen_ai.request.available_tools" + ] + + # Available tools is serialized as a string + assert "add_numbers" in available_tools_str else: - (transaction,) = events - spans = transaction["spans"] + events = capture_events() - if handled_tool_call_exceptions: - assert error["level"] == "error" - assert error["exception"]["values"][0]["mechanism"]["handled"] + result = await test_agent.run("What is 5 + 3?") + + assert result is not None + + if handled_tool_call_exceptions: + (error, transaction) = events + else: + (transaction,) = events + spans = transaction["spans"] + + if handled_tool_call_exceptions: + assert error["level"] == "error" + assert error["exception"]["values"][0]["mechanism"]["handled"] - # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] 
- # Should have tool spans - assert len(tool_spans) >= 1 + # Should have tool spans + assert len(tool_spans) >= 1 - # Check tool spans - model_retry_tool_span = tool_spans[0] - assert "execute_tool" in model_retry_tool_span["description"] - assert model_retry_tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert model_retry_tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in model_retry_tool_span["data"] + # Check tool spans + model_retry_tool_span = tool_spans[0] + assert "execute_tool" in model_retry_tool_span["description"] + assert model_retry_tool_span["data"]["gen_ai.operation.name"] == "execute_tool" + assert model_retry_tool_span["data"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in model_retry_tool_span["data"] - tool_span = tool_spans[1] - assert "execute_tool" in tool_span["description"] - assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in tool_span["data"] - assert "gen_ai.tool.output" in tool_span["data"] + tool_span = tool_spans[1] + assert "execute_tool" in tool_span["description"] + assert tool_span["data"]["gen_ai.operation.name"] == "execute_tool" + assert tool_span["data"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in tool_span["data"] + assert "gen_ai.tool.output" in tool_span["data"] - # Check chat spans have available_tools - for chat_span in chat_spans: - assert "gen_ai.request.available_tools" in chat_span["data"] - available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] - # Available tools is serialized as a string - assert "add_numbers" in available_tools_str + # Check chat spans have available_tools + for chat_span in chat_spans: + assert "gen_ai.request.available_tools" in chat_span["data"] + available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] + # Available tools is serialized as a string + 
assert "add_numbers" in available_tools_str +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "handled_tool_call_exceptions", [False, True], ) @pytest.mark.asyncio async def test_agent_with_tool_validation_error( - sentry_init, capture_events, get_test_agent, handled_tool_call_exceptions + sentry_init, + capture_events, + capture_items, + get_test_agent, + handled_tool_call_exceptions, + stream_gen_ai_spans, ): """ Test that a handled exception is captured when a tool has unsatisfiable constraints. @@ -455,6 +780,7 @@ async def test_agent_with_tool_validation_error( ], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -464,51 +790,109 @@ def add_numbers(a: Annotated[int, Field(gt=0, lt=0)], b: int) -> int: """Add two numbers together.""" return a + b - events = capture_events() - - result = None - with pytest.raises(UnexpectedModelBehavior): - result = await test_agent.run("What is 5 + 3?") - - assert result is None - - if handled_tool_call_exceptions: - (error, model_behaviour_error, transaction) = events - else: - ( - model_behaviour_error, - transaction, - ) = events - spans = transaction["spans"] + if stream_gen_ai_spans: + items = capture_items("event", "transaction", "span") - if handled_tool_call_exceptions: - assert error["level"] == "error" - assert error["exception"]["values"][0]["mechanism"]["handled"] + result = None + with pytest.raises(UnexpectedModelBehavior): + result = await test_agent.run("What is 5 + 3?") - # Find child span types (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + assert result is None - # Should have tool spans - assert len(tool_spans) >= 1 + if handled_tool_call_exceptions: + ( + error, + model_behaviour_error, + ) = (item.payload for item in items if item.type == "event") - # Check 
tool spans - model_retry_tool_span = tool_spans[0] - assert "execute_tool" in model_retry_tool_span["description"] - assert model_retry_tool_span["data"]["gen_ai.operation.name"] == "execute_tool" - assert model_retry_tool_span["data"]["gen_ai.tool.name"] == "add_numbers" - assert "gen_ai.tool.input" in model_retry_tool_span["data"] + assert error["level"] == "error" + assert error["exception"]["values"][0]["mechanism"]["handled"] - # Check chat spans have available_tools - for chat_span in chat_spans: - assert "gen_ai.request.available_tools" in chat_span["data"] - available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] - # Available tools is serialized as a string - assert "add_numbers" in available_tools_str + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] + # Should have tool spans + assert len(tool_spans) >= 1 + # Check tool spans + model_retry_tool_span = tool_spans[0] + assert "execute_tool" in model_retry_tool_span["name"] + assert ( + model_retry_tool_span["attributes"]["gen_ai.operation.name"] + == "execute_tool" + ) + assert model_retry_tool_span["attributes"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in model_retry_tool_span["attributes"] + + # Check chat spans have available_tools + for chat_span in chat_spans: + assert "gen_ai.request.available_tools" in chat_span["attributes"] + available_tools_str = chat_span["attributes"][ + "gen_ai.request.available_tools" + ] + + # Available tools is serialized as a string + assert "add_numbers" in available_tools_str + else: + events = capture_events() + + result = None + with pytest.raises(UnexpectedModelBehavior): + result = await test_agent.run("What is 5 + 3?") + + assert result is None + + if handled_tool_call_exceptions: + (error, 
model_behaviour_error, transaction) = events + else: + ( + model_behaviour_error, + transaction, + ) = events + spans = transaction["spans"] + + if handled_tool_call_exceptions: + assert error["level"] == "error" + assert error["exception"]["values"][0]["mechanism"]["handled"] + + # Find child span types (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + + # Should have tool spans + assert len(tool_spans) >= 1 + + # Check tool spans + model_retry_tool_span = tool_spans[0] + assert "execute_tool" in model_retry_tool_span["description"] + assert model_retry_tool_span["data"]["gen_ai.operation.name"] == "execute_tool" + assert model_retry_tool_span["data"]["gen_ai.tool.name"] == "add_numbers" + assert "gen_ai.tool.input" in model_retry_tool_span["data"] + + # Check chat spans have available_tools + for chat_span in chat_spans: + assert "gen_ai.request.available_tools" in chat_span["data"] + available_tools_str = chat_span["data"]["gen_ai.request.available_tools"] + # Available tools is serialized as a string + assert "add_numbers" in available_tools_str + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_with_tools_streaming(sentry_init, capture_events, get_test_agent): +async def test_agent_with_tools_streaming( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that tool execution works correctly with streaming. 
""" @@ -516,6 +900,7 @@ async def test_agent_with_tools_streaming(sentry_init, capture_events, get_test_ integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -525,36 +910,73 @@ def multiply(a: int, b: int) -> int: """Multiply two numbers.""" return a * b - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - async with test_agent.run_stream("What is 7 times 8?") as result: - async for _ in result.stream_output(): - pass + async with test_agent.run_stream("What is 7 times 8?") as result: + async for _ in result.stream_output(): + pass + + spans = [item.payload for item in items if item.type == "span"] + + # Find span types + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] + + # Should have tool spans + assert len(tool_spans) >= 1 + + # Verify streaming flag is True + for chat_span in chat_spans: + assert chat_span["attributes"]["gen_ai.response.streaming"] is True - (transaction,) = events - spans = transaction["spans"] + # Check tool span + tool_span = tool_spans[0] + assert tool_span["attributes"]["gen_ai.tool.name"] == "multiply" + assert "gen_ai.tool.input" in tool_span["attributes"] + assert "gen_ai.tool.output" in tool_span["attributes"] + else: + events = capture_events() + + async with test_agent.run_stream("What is 7 times 8?") as result: + async for _ in result.stream_output(): + pass - # Find span types - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + (transaction,) = events + spans = transaction["spans"] - # Should have tool spans - assert len(tool_spans) >= 1 + # Find span types + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + tool_spans = [s 
for s in spans if s["op"] == "gen_ai.execute_tool"] - # Verify streaming flag is True - for chat_span in chat_spans: - assert chat_span["data"]["gen_ai.response.streaming"] is True + # Should have tool spans + assert len(tool_spans) >= 1 - # Check tool span - tool_span = tool_spans[0] - assert tool_span["data"]["gen_ai.tool.name"] == "multiply" - assert "gen_ai.tool.input" in tool_span["data"] - assert "gen_ai.tool.output" in tool_span["data"] + # Verify streaming flag is True + for chat_span in chat_spans: + assert chat_span["data"]["gen_ai.response.streaming"] is True + # Check tool span + tool_span = tool_spans[0] + assert tool_span["data"]["gen_ai.tool.name"] == "multiply" + assert "gen_ai.tool.input" in tool_span["data"] + assert "gen_ai.tool.output" in tool_span["data"] + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_model_settings( - sentry_init, capture_events, get_test_agent_with_settings + sentry_init, + capture_events, + capture_items, + get_test_agent_with_settings, + stream_gen_ai_spans, ): """ Test that model settings are captured in spans. 
@@ -562,27 +984,49 @@ async def test_model_settings( sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent_with_settings = get_test_agent_with_settings() - await test_agent_with_settings.run("Test input") - (transaction,) = events - spans = transaction["spans"] + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Find chat span - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + await test_agent_with_settings.run("Test input") - chat_span = chat_spans[0] - # Check that model settings are captured - assert chat_span["data"].get("gen_ai.request.temperature") == 0.7 - assert chat_span["data"].get("gen_ai.request.max_tokens") == 100 - assert chat_span["data"].get("gen_ai.request.top_p") == 0.9 + spans = [item.payload for item in items if item.type == "span"] + + # Find chat span + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + # Check that model settings are captured + assert chat_span["attributes"].get("gen_ai.request.temperature") == 0.7 + assert chat_span["attributes"].get("gen_ai.request.max_tokens") == 100 + assert chat_span["attributes"].get("gen_ai.request.top_p") == 0.9 + else: + events = capture_events() + + await test_agent_with_settings.run("Test input") + + (transaction,) = events + spans = transaction["spans"] + + # Find chat span + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + # Check that model settings are captured + assert chat_span["data"].get("gen_ai.request.temperature") == 0.7 + assert chat_span["data"].get("gen_ai.request.max_tokens") == 100 + assert chat_span["data"].get("gen_ai.request.top_p") == 0.9 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio 
@pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -594,7 +1038,12 @@ async def test_model_settings( ], ) async def test_system_prompt_attribute( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """ Test that system prompts are included as the first message. @@ -609,35 +1058,70 @@ async def test_system_prompt_attribute( integrations=[PydanticAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - await agent.run("Hello") + await agent.run("Hello") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - # The transaction IS the invoke_agent span, check for messages in chat spans instead - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + # The transaction IS the invoke_agent span, check for messages in chat spans instead + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 - chat_span = chat_spans[0] + chat_span = chat_spans[0] - if send_default_pii and include_prompts: - system_instructions = chat_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - assert json.loads(system_instructions) == [ - { - "type": "text", - "content": "You are a helpful assistant specialized in testing.", - } - ] + if send_default_pii and include_prompts: + system_instructions = chat_span["attributes"][ + SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS + ] + assert json.loads(system_instructions) == [ + { + "type": "text", + "content": "You are a helpful assistant specialized in testing.", + } + ] + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["attributes"] 
else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["data"] + events = capture_events() + await agent.run("Hello") + + (transaction,) = events + spans = transaction["spans"] + # The transaction IS the invoke_agent span, check for messages in chat spans instead + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + + if send_default_pii and include_prompts: + system_instructions = chat_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + assert json.loads(system_instructions) == [ + { + "type": "text", + "content": "You are a helpful assistant specialized in testing.", + } + ] + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["data"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_error_handling(sentry_init, capture_events): +async def test_error_handling( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test error handling in agent execution. 
""" @@ -651,24 +1135,42 @@ async def test_error_handling(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Simple run that should succeed - await agent.run("Hello") + # Simple run that should succeed + await agent.run("Hello") + + # At minimum, we should have a transaction + transaction = next(item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + # Simple run that should succeed + await agent.run("Hello") + + # At minimum, we should have a transaction + assert len(events) >= 1 + transaction = [e for e in events if e.get("type") == "transaction"][0] - # At minimum, we should have a transaction - assert len(events) >= 1 - transaction = [e for e in events if e.get("type") == "transaction"][0] assert transaction["transaction"] == "invoke_agent test_error" # Transaction should complete successfully (status key may not exist if no error) trace_status = transaction["contexts"]["trace"].get("status") assert trace_status != "error" # Could be None or some other status +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_without_pii(sentry_init, capture_events, get_test_agent): +async def test_without_pii( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that PII is not captured when send_default_pii is False. 
""" @@ -676,27 +1178,53 @@ async def test_without_pii(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - test_agent = get_test_agent() - await test_agent.run("Sensitive input") + test_agent = get_test_agent() + await test_agent.run("Sensitive input") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] - # Verify that messages and response text are not captured - for span in chat_spans: - assert "gen_ai.request.messages" not in span["data"] - assert "gen_ai.response.text" not in span["data"] + # Verify that messages and response text are not captured + for span in chat_spans: + assert "gen_ai.request.messages" not in span["attributes"] + assert "gen_ai.response.text" not in span["attributes"] + else: + events = capture_events() + + test_agent = get_test_agent() + await test_agent.run("Sensitive input") + + (transaction,) = events + spans = transaction["spans"] + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + # Verify that messages and response text are not captured + for span in chat_spans: + assert "gen_ai.request.messages" not in span["data"] + assert "gen_ai.response.text" not in span["data"] + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_without_pii_tools(sentry_init, capture_events, get_test_agent): +async def 
test_without_pii_tools( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that tool input/output are not captured when send_default_pii is False. """ @@ -704,6 +1232,7 @@ async def test_without_pii_tools(sentry_init, capture_events, get_test_agent): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=False, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -713,55 +1242,102 @@ def sensitive_tool(data: str) -> str: """A tool with sensitive data.""" return f"Processed: {data}" - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await test_agent.run("Use sensitive tool with private data") + + spans = [item.payload for item in items if item.type == "span"] + + # Find tool spans + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] + + # If tool was executed, verify input/output are not captured + for tool_span in tool_spans: + assert "gen_ai.tool.input" not in tool_span["attributes"] + assert "gen_ai.tool.output" not in tool_span["attributes"] + else: + events = capture_events() - await test_agent.run("Use sensitive tool with private data") + await test_agent.run("Use sensitive tool with private data") - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - # Find tool spans - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + # Find tool spans + tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] - # If tool was executed, verify input/output are not captured - for tool_span in tool_spans: - assert "gen_ai.tool.input" not in tool_span["data"] - assert "gen_ai.tool.output" not in tool_span["data"] + # If tool was executed, verify input/output are not captured + for tool_span in tool_spans: + assert "gen_ai.tool.input" not in tool_span["data"] + assert 
"gen_ai.tool.output" not in tool_span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_multiple_agents_concurrent(sentry_init, capture_events, get_test_agent): +async def test_multiple_agents_concurrent( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that multiple agents can run concurrently without interfering. """ sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() async def run_agent(input_text): return await test_agent.run(input_text) - # Run 3 agents concurrently - results = await asyncio.gather(*[run_agent(f"Input {i}") for i in range(3)]) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - assert len(results) == 3 - assert len(events) == 3 + # Run 3 agents concurrently + results = await asyncio.gather(*[run_agent(f"Input {i}") for i in range(3)]) - # Verify each transaction is separate - for i, transaction in enumerate(events): - assert transaction["type"] == "transaction" - assert transaction["transaction"] == "invoke_agent test_agent" - # Each should have its own spans - assert len(transaction["spans"]) >= 1 + assert len(results) == 3 + + # Verify each transaction is separate + events = [item.payload for item in items if item.type == "transaction"] + assert len(events) == 3 + for i, transaction in enumerate(events): + assert transaction["transaction"] == "invoke_agent test_agent" + else: + events = capture_events() + + # Run 3 agents concurrently + results = await asyncio.gather(*[run_agent(f"Input {i}") for i in range(3)]) + assert len(results) == 3 + assert len(events) == 3 + # Verify each transaction is separate + for i, transaction in enumerate(events): + assert transaction["type"] == "transaction" + assert transaction["transaction"] == "invoke_agent test_agent" + # Each should have its 
own spans + assert len(transaction["spans"]) >= 1 + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_message_history(sentry_init, capture_events): +async def test_message_history( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that full conversation history is captured in chat spans. """ @@ -774,13 +1350,9 @@ async def test_message_history(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - - # First message - await agent.run("Hello, I'm Alice") - # Second message with history from pydantic_ai import messages @@ -794,54 +1366,118 @@ async def test_message_history(sentry_init, capture_events): ), ] - await agent.run("What is my name?", message_history=history) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # We should have 2 transactions - assert len(events) >= 2 + # First message + await agent.run("Hello, I'm Alice") - # Check the second transaction has the full history - second_transaction = events[1] - spans = second_transaction["spans"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + await agent.run("What is my name?", message_history=history) - if chat_spans: - chat_span = chat_spans[0] - if "gen_ai.request.messages" in chat_span["data"]: - messages_data = chat_span["data"]["gen_ai.request.messages"] - # Should have multiple messages including history - assert len(messages_data) > 1 + # We should have 2 transactions + events = [item.payload for item in items if item.type == "transaction"] + + # Check the second transaction has the full history + second_transaction = events[1] + spans = second_transaction["spans"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + + if chat_spans: + chat_span = chat_spans[0] + if "gen_ai.request.messages" in 
chat_span["attributes"]: + messages_data = chat_span["attributes"]["gen_ai.request.messages"] + # Should have multiple messages including history + assert len(messages_data) > 1 + else: + events = capture_events() + + # First message + await agent.run("Hello, I'm Alice") + + await agent.run("What is my name?", message_history=history) + + # We should have 2 transactions + assert len(events) >= 2 + + # Check the second transaction has the full history + second_transaction = events[1] + spans = second_transaction["spans"] + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + + if chat_spans: + chat_span = chat_spans[0] + if "gen_ai.request.messages" in chat_span["data"]: + messages_data = chat_span["data"]["gen_ai.request.messages"] + # Should have multiple messages including history + assert len(messages_data) > 1 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_gen_ai_system(sentry_init, capture_events, get_test_agent): +async def test_gen_ai_system( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that gen_ai.system is set from the model. 
""" sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - await test_agent.run("Test input") - (transaction,) = events - spans = transaction["spans"] + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Find chat span - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + await test_agent.run("Test input") - chat_span = chat_spans[0] - # gen_ai.system should be set from the model (TestModel -> 'test') - assert "gen_ai.system" in chat_span["data"] - assert chat_span["data"]["gen_ai.system"] == "test" + spans = [item.payload for item in items if item.type == "span"] + # Find chat span + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + chat_span = chat_spans[0] + # gen_ai.system should be set from the model (TestModel -> 'test') + assert "gen_ai.system" in chat_span["attributes"] + assert chat_span["attributes"]["gen_ai.system"] == "test" + else: + events = capture_events() + + await test_agent.run("Test input") + + (transaction,) = events + spans = transaction["spans"] + + # Find chat span + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + # gen_ai.system should be set from the model (TestModel -> 'test') + assert "gen_ai.system" in chat_span["data"] + assert chat_span["data"]["gen_ai.system"] == "test" + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_include_prompts_false(sentry_init, capture_events, get_test_agent): +async def test_include_prompts_false( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that prompts are not captured when include_prompts=False. 
""" @@ -849,27 +1485,53 @@ async def test_include_prompts_false(sentry_init, capture_events, get_test_agent integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, # Even with PII enabled, prompts should not be captured + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + test_agent = get_test_agent() + + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await test_agent.run("Sensitive prompt") + + spans = [item.payload for item in items if item.type == "span"] + + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + + # Verify that messages and response text are not captured + for span in chat_spans: + assert "gen_ai.request.messages" not in span["attributes"] + assert "gen_ai.response.text" not in span["attributes"] + else: + events = capture_events() - test_agent = get_test_agent() - await test_agent.run("Sensitive prompt") + await test_agent.run("Sensitive prompt") - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - # Verify that messages and response text are not captured - for span in chat_spans: - assert "gen_ai.request.messages" not in span["data"] - assert "gen_ai.response.text" not in span["data"] + # Verify that messages and response text are not captured + for span in chat_spans: + assert "gen_ai.request.messages" not in span["data"] + assert "gen_ai.response.text" not in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def 
test_include_prompts_true(sentry_init, capture_events, get_test_agent): +async def test_include_prompts_true( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that prompts are captured when include_prompts=True (default). """ @@ -877,28 +1539,52 @@ async def test_include_prompts_true(sentry_init, capture_events, get_test_agent) integrations=[PydanticAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - await test_agent.run("Test prompt") - (transaction,) = events - spans = transaction["spans"] + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + await test_agent.run("Test prompt") - # Verify that messages are captured in chat spans - assert len(chat_spans) >= 1 - for chat_span in chat_spans: - assert "gen_ai.request.messages" in chat_span["data"] + spans = [item.payload for item in items if item.type == "span"] + + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + + # Verify that messages are captured in chat spans + assert len(chat_spans) >= 1 + for chat_span in chat_spans: + assert "gen_ai.request.messages" in chat_span["attributes"] + else: + events = capture_events() + + await test_agent.run("Test prompt") + + (transaction,) = events + spans = transaction["spans"] + + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + + # Verify that messages are captured in chat spans + assert len(chat_spans) >= 1 + for chat_span in chat_spans: + assert "gen_ai.request.messages" in chat_span["data"] 
+@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_include_prompts_false_with_tools( - sentry_init, capture_events, get_test_agent + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, ): """ Test that tool input/output are not captured when include_prompts=False. @@ -907,6 +1593,7 @@ async def test_include_prompts_false_with_tools( integrations=[PydanticAIIntegration(include_prompts=False)], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -916,25 +1603,49 @@ def test_tool(value: int) -> int: """A test tool.""" return value * 2 - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - await test_agent.run("Use the test tool with value 5") + await test_agent.run("Use the test tool with value 5") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - # Find tool spans - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + # Find tool spans + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] + + # If tool was executed, verify input/output are not captured + for tool_span in tool_spans: + assert "gen_ai.tool.input" not in tool_span["attributes"] + assert "gen_ai.tool.output" not in tool_span["attributes"] + else: + events = capture_events() + + await test_agent.run("Use the test tool with value 5") - # If tool was executed, verify input/output are not captured - for tool_span in tool_spans: - assert "gen_ai.tool.input" not in tool_span["data"] - assert "gen_ai.tool.output" not in tool_span["data"] + (transaction,) = events + spans = transaction["spans"] + + # Find tool spans + tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] + # If tool was executed, verify input/output are not captured + for tool_span in 
tool_spans: + assert "gen_ai.tool.input" not in tool_span["data"] + assert "gen_ai.tool.output" not in tool_span["data"] + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_include_prompts_requires_pii( - sentry_init, capture_events, get_test_agent + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, ): """ Test that include_prompts requires send_default_pii=True. @@ -943,27 +1654,52 @@ async def test_include_prompts_requires_pii( integrations=[PydanticAIIntegration(include_prompts=True)], traces_sample_rate=1.0, send_default_pii=False, # PII disabled + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - test_agent = get_test_agent() - await test_agent.run("Test prompt") - (transaction,) = events - spans = transaction["spans"] + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await test_agent.run("Test prompt") + + spans = [item.payload for item in items if item.type == "span"] + + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + + # Even with include_prompts=True, if PII is disabled, messages should not be captured + for span in chat_spans: + assert "gen_ai.request.messages" not in span["attributes"] + assert "gen_ai.response.text" not in span["attributes"] + else: + events = capture_events() + + await test_agent.run("Test prompt") + + (transaction,) = events + spans = transaction["spans"] - # Find child spans (invoke_agent is the transaction, not a child span) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + # Find child spans (invoke_agent is the transaction, not a child span) + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - # Even with include_prompts=True, if PII is disabled, messages should not be captured - for span in chat_spans: - assert "gen_ai.request.messages" not in 
span["data"] - assert "gen_ai.response.text" not in span["data"] + # Even with include_prompts=True, if PII is disabled, messages should not be captured + for span in chat_spans: + assert "gen_ai.request.messages" not in span["data"] + assert "gen_ai.response.text" not in span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_mcp_tool_execution_spans(sentry_init, capture_events): +async def test_mcp_tool_execution_spans( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that MCP (Model Context Protocol) tool calls create execute_tool spans. @@ -1033,59 +1769,112 @@ async def mock_map_tool_result_part(part): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + # Simulate MCP tool execution within a transaction through CombinedToolset + with sentry_sdk.start_transaction( + op="ai.run", name="invoke_agent test_mcp_agent" + ): + # Set up the agent context + scope = sentry_sdk.get_current_scope() + scope._contexts["pydantic_ai_agent"] = { + "_agent": agent, + } - # Simulate MCP tool execution within a transaction through CombinedToolset - with sentry_sdk.start_transaction( - op="ai.run", name="invoke_agent test_mcp_agent" - ) as transaction: - # Set up the agent context - scope = sentry_sdk.get_current_scope() - scope._contexts["pydantic_ai_agent"] = { - "_agent": agent, - } + # Create a mock tool that simulates an MCP tool from CombinedToolset + from pydantic_ai._run_context import RunContext + from pydantic_ai.result import RunUsage + from pydantic_ai.models.test import TestModel + from pydantic_ai.toolsets.combined import _CombinedToolsetTool + + ctx = RunContext( + deps=None, + model=TestModel(), + usage=RunUsage(), + retry=0, + tool_name="test_mcp_tool", + ) - # Create a mock tool that 
simulates an MCP tool from CombinedToolset - from pydantic_ai._run_context import RunContext - from pydantic_ai.result import RunUsage - from pydantic_ai.models.test import TestModel - from pydantic_ai.toolsets.combined import _CombinedToolsetTool - - ctx = RunContext( - deps=None, - model=TestModel(), - usage=RunUsage(), - retry=0, - tool_name="test_mcp_tool", - ) + tool_name = "test_mcp_tool" + + # Create a tool that points to the MCP server + # This simulates how CombinedToolset wraps tools from different sources + tool = _CombinedToolsetTool( + toolset=combined, + tool_def=MagicMock(name=tool_name), + max_retries=0, + args_validator=MagicMock(), + source_toolset=mock_server, + source_tool=MagicMock(), + ) - tool_name = "test_mcp_tool" - - # Create a tool that points to the MCP server - # This simulates how CombinedToolset wraps tools from different sources - tool = _CombinedToolsetTool( - toolset=combined, - tool_def=MagicMock(name=tool_name), - max_retries=0, - args_validator=MagicMock(), - source_toolset=mock_server, - source_tool=MagicMock(), - ) + try: + await combined.call_tool(tool_name, {"query": "test"}, ctx, tool) + except Exception: + # MCP tool might raise if not fully mocked, that's okay + pass - try: - await combined.call_tool(tool_name, {"query": "test"}, ctx, tool) - except Exception: - # MCP tool might raise if not fully mocked, that's okay - pass + if len(items) == 0: + pytest.skip("No events captured, MCP test setup incomplete") + + (transaction,) = (item.payload for item in items if item.type == "transaction") + transaction["spans"] + else: + events = capture_events() + + # Simulate MCP tool execution within a transaction through CombinedToolset + with sentry_sdk.start_transaction( + op="ai.run", name="invoke_agent test_mcp_agent" + ) as transaction: + # Set up the agent context + scope = sentry_sdk.get_current_scope() + scope._contexts["pydantic_ai_agent"] = { + "_agent": agent, + } + + # Create a mock tool that simulates an MCP tool from 
CombinedToolset + from pydantic_ai._run_context import RunContext + from pydantic_ai.result import RunUsage + from pydantic_ai.models.test import TestModel + from pydantic_ai.toolsets.combined import _CombinedToolsetTool + + ctx = RunContext( + deps=None, + model=TestModel(), + usage=RunUsage(), + retry=0, + tool_name="test_mcp_tool", + ) + + tool_name = "test_mcp_tool" + + # Create a tool that points to the MCP server + # This simulates how CombinedToolset wraps tools from different sources + tool = _CombinedToolsetTool( + toolset=combined, + tool_def=MagicMock(name=tool_name), + max_retries=0, + args_validator=MagicMock(), + source_toolset=mock_server, + source_tool=MagicMock(), + ) + + try: + await combined.call_tool(tool_name, {"query": "test"}, ctx, tool) + except Exception: + # MCP tool might raise if not fully mocked, that's okay + pass - events_list = events - if len(events_list) == 0: - pytest.skip("No events captured, MCP test setup incomplete") + events_list = events + if len(events_list) == 0: + pytest.skip("No events captured, MCP test setup incomplete") - (transaction,) = events_list - transaction["spans"] + (transaction,) = events_list + transaction["spans"] # Note: This test manually calls combined.call_tool which doesn't go through # ToolManager._call_tool (which is what the integration patches). @@ -1255,8 +2044,14 @@ async def run_and_check_context(agent, agent_name): # ==================== Additional Coverage Tests ==================== +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): +async def test_invoke_agent_with_list_user_prompt( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that invoke_agent span handles list user prompts correctly. 
""" @@ -1269,14 +2064,23 @@ async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + # Use a list as user prompt + await agent.run(["First part", "Second part"]) - # Use a list as user prompt - await agent.run(["First part", "Second part"]) + (transaction,) = [item.payload for item in items if item.type == "transaction"] + else: + events = capture_events() + + # Use a list as user prompt + await agent.run(["First part", "Second part"]) - (transaction,) = events + (transaction,) = events # Check that the invoke_agent transaction has messages data # The invoke_agent is the transaction itself @@ -1288,6 +2092,7 @@ async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): assert "Second part" in messages_str +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "send_default_pii, include_prompts", @@ -1299,7 +2104,12 @@ async def test_invoke_agent_with_list_user_prompt(sentry_init, capture_events): ], ) async def test_invoke_agent_with_instructions( - sentry_init, capture_events, send_default_pii, include_prompts + sentry_init, + capture_events, + capture_items, + send_default_pii, + include_prompts, + stream_gen_ai_spans, ): """ Test that invoke_agent span handles instructions correctly. 
@@ -1320,33 +2130,61 @@ async def test_invoke_agent_with_instructions( integrations=[PydanticAIIntegration(include_prompts=include_prompts)], traces_sample_rate=1.0, send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await agent.run("Test input") - await agent.run("Test input") + spans = [item.payload for item in items if item.type == "span"] - (transaction,) = events - spans = transaction["spans"] + # The transaction IS the invoke_agent span, check for messages in chat spans instead + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 - # The transaction IS the invoke_agent span, check for messages in chat spans instead - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + chat_span = chat_spans[0] - chat_span = chat_spans[0] + if send_default_pii and include_prompts: + system_instructions = chat_span["attributes"][ + SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS + ] + assert json.loads(system_instructions) == [ + {"type": "text", "content": "System prompt"}, + {"type": "text", "content": "Instruction 1\nInstruction 2"}, + ] + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["attributes"] - if send_default_pii and include_prompts: - system_instructions = chat_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] - assert json.loads(system_instructions) == [ - {"type": "text", "content": "System prompt"}, - {"type": "text", "content": "Instruction 1\nInstruction 2"}, - ] else: - assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["data"] + events = capture_events() + + await agent.run("Test input") + + (transaction,) = events + spans = transaction["spans"] + + # The transaction IS the invoke_agent span, check for messages in chat spans instead + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + 
assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + + if send_default_pii and include_prompts: + system_instructions = chat_span["data"][SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS] + assert json.loads(system_instructions) == [ + {"type": "text", "content": "System prompt"}, + {"type": "text", "content": "Instruction 1\nInstruction 2"}, + ] + else: + assert SPANDATA.GEN_AI_SYSTEM_INSTRUCTIONS not in chat_span["data"] @pytest.mark.asyncio -async def test_model_name_extraction_with_callable(sentry_init, capture_events): +async def test_model_name_extraction_with_callable(sentry_init, capture_items): """ Test model name extraction when model has a callable name() method. """ @@ -1372,7 +2210,7 @@ async def test_model_name_extraction_with_callable(sentry_init, capture_events): @pytest.mark.asyncio -async def test_model_name_extraction_fallback_to_str(sentry_init, capture_events): +async def test_model_name_extraction_fallback_to_str(sentry_init, capture_items): """ Test model name extraction falls back to str() when no name attribute exists. """ @@ -1399,7 +2237,7 @@ async def test_model_name_extraction_fallback_to_str(sentry_init, capture_events @pytest.mark.asyncio -async def test_model_settings_object_style(sentry_init, capture_events): +async def test_model_settings_object_style(sentry_init, capture_items): """ Test that object-style model settings (non-dict) are handled correctly. """ @@ -1432,8 +2270,14 @@ async def test_model_settings_object_style(sentry_init, capture_events): assert transaction is not None +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_usage_data_partial(sentry_init, capture_events): +async def test_usage_data_partial( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that usage data is correctly handled when only some fields are present. 
""" @@ -1445,16 +2289,29 @@ async def test_usage_data_partial(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await agent.run("Test input") + + spans = [item.payload for item in items if item.type == "span"] + + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + else: + events = capture_events() + + await agent.run("Test input") - await agent.run("Test input") + (transaction,) = events + spans = transaction["spans"] - (transaction,) = events - spans = transaction["spans"] + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] assert len(chat_spans) >= 1 # Check that usage data fields exist (they may or may not be set depending on TestModel) @@ -1463,8 +2320,14 @@ async def test_usage_data_partial(sentry_init, capture_events): assert chat_span is not None +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_data_from_scope(sentry_init, capture_events): +async def test_agent_data_from_scope( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that agent data can be retrieved from Sentry scope when not passed directly. 
""" @@ -1477,22 +2340,38 @@ async def test_agent_data_from_scope(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + # The integration automatically sets agent in scope during execution + await agent.run("Test input") + + # Verify agent name is capture + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() - # The integration automatically sets agent in scope during execution - await agent.run("Test input") + # The integration automatically sets agent in scope during execution + await agent.run("Test input") - (transaction,) = events + # Verify agent name is capture + (transaction,) = events # Verify agent name is captured assert transaction["transaction"] == "invoke_agent test_scope_agent" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio async def test_available_tools_without_description( - sentry_init, capture_events, get_test_agent + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, ): """ Test that available tools are captured even when description is missing. 
@@ -1500,6 +2379,7 @@ async def test_available_tools_without_description( sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -1509,23 +2389,46 @@ def tool_without_desc(x: int) -> int: # No docstring = no description return x * 2 - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - await test_agent.run("Use the tool with 5") + await test_agent.run("Use the tool with 5") - (transaction,) = events - spans = transaction["spans"] + spans = [item.payload for item in items if item.type == "span"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] - if chat_spans: - chat_span = chat_spans[0] - if "gen_ai.request.available_tools" in chat_span["data"]: - tools_str = chat_span["data"]["gen_ai.request.available_tools"] - assert "tool_without_desc" in tools_str + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + if chat_spans: + chat_span = chat_spans[0] + if "gen_ai.request.available_tools" in chat_span["attributes"]: + tools_str = chat_span["attributes"]["gen_ai.request.available_tools"] + assert "tool_without_desc" in tools_str + else: + events = capture_events() + + await test_agent.run("Use the tool with 5") + + (transaction,) = events + spans = transaction["spans"] + + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + if chat_spans: + chat_span = chat_spans[0] + if "gen_ai.request.available_tools" in chat_span["data"]: + tools_str = chat_span["data"]["gen_ai.request.available_tools"] + assert "tool_without_desc" in tools_str +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_output_with_tool_calls(sentry_init, capture_events, get_test_agent): +async def test_output_with_tool_calls( + sentry_init, + capture_events, + capture_items, + get_test_agent, + stream_gen_ai_spans, +): """ Test that tool calls in 
model response are captured correctly. """ @@ -1533,6 +2436,7 @@ async def test_output_with_tool_calls(sentry_init, capture_events, get_test_agen integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) test_agent = get_test_agent() @@ -1542,27 +2446,53 @@ def calc_tool(value: int) -> int: """Calculate something.""" return value + 10 - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await test_agent.run("Use calc_tool with 5") - await test_agent.run("Use calc_tool with 5") + spans = [item.payload for item in items if item.type == "span"] - (transaction,) = events - spans = transaction["spans"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + # At least one chat span should exist + assert len(chat_spans) >= 1 - # At least one chat span should exist - assert len(chat_spans) >= 1 + # Check if tool calls are captured in response + for chat_span in chat_spans: + # Tool calls may or may not be in response depending on TestModel behavior + # Just verify the span was created and has basic data + assert "gen_ai.operation.name" in chat_span["attributes"] + else: + events = capture_events() + + await test_agent.run("Use calc_tool with 5") + + (transaction,) = events + spans = transaction["spans"] + + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + + # At least one chat span should exist + assert len(chat_spans) >= 1 - # Check if tool calls are captured in response - for chat_span in chat_spans: - # Tool calls may or may not be in response depending on TestModel behavior - # Just verify the span was created and has basic data - assert "gen_ai.operation.name" in chat_span["data"] + # Check if tool calls are captured in response + for chat_span in chat_spans: + # Tool calls may or may not be in response depending on TestModel 
behavior + # Just verify the span was created and has basic data + assert "gen_ai.operation.name" in chat_span["data"] +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_message_formatting_with_different_parts(sentry_init, capture_events): +async def test_message_formatting_with_different_parts( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that different message part types are handled correctly in ai_client span. """ @@ -1577,10 +2507,9 @@ async def test_message_formatting_with_different_parts(sentry_init, capture_even integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() - # Create message history with different part types history = [ messages.ModelRequest(parts=[messages.UserPromptPart(content="Hello")]), @@ -1592,26 +2521,48 @@ async def test_message_formatting_with_different_parts(sentry_init, capture_even ), ] - await agent.run("What did I say?", message_history=history) + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - (transaction,) = events - spans = transaction["spans"] + await agent.run("What did I say?", message_history=history) - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + spans = [item.payload for item in items if item.type == "span"] - # Should have chat spans - assert len(chat_spans) >= 1 + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] - # Check that messages are captured - chat_span = chat_spans[0] - if "gen_ai.request.messages" in chat_span["data"]: - messages_data = chat_span["data"]["gen_ai.request.messages"] - # Should contain message history - assert messages_data is not None + # Should have chat spans + assert len(chat_spans) >= 1 + + # Check that messages are captured + chat_span = chat_spans[0] + if "gen_ai.request.messages" in chat_span["attributes"]: + 
messages_data = chat_span["attributes"]["gen_ai.request.messages"] + assert messages_data is not None + else: + events = capture_events() + + await agent.run("What did I say?", message_history=history) + + (transaction,) = events + spans = transaction["spans"] + + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + + # Should have chat spans + assert len(chat_spans) >= 1 + + # Check that messages are captured + chat_span = chat_spans[0] + if "gen_ai.request.messages" in chat_span["data"]: + messages_data = chat_span["data"]["gen_ai.request.messages"] + # Should contain message history + assert messages_data is not None @pytest.mark.asyncio -async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_events): +async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_items): """ Test that update_invoke_agent_span handles None output gracefully. """ @@ -1639,7 +2590,7 @@ async def test_update_invoke_agent_span_with_none_output(sentry_init, capture_ev @pytest.mark.asyncio -async def test_update_ai_client_span_with_none_response(sentry_init, capture_events): +async def test_update_ai_client_span_with_none_response(sentry_init, capture_items): """ Test that update_ai_client_span handles None response gracefully. """ @@ -1665,8 +2616,14 @@ async def test_update_ai_client_span_with_none_response(sentry_init, capture_eve assert transaction is not None +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_agent_without_name(sentry_init, capture_events): +async def test_agent_without_name( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that agent without a name is handled correctly. 
""" @@ -1676,22 +2633,32 @@ async def test_agent_without_name(sentry_init, capture_events): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + await agent.run("Test input") + + # Should still create transaction, just with default name + (transaction,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + await agent.run("Test input") - await agent.run("Test input") + (transaction,) = events - (transaction,) = events + # Should still create transaction, just with default name + assert transaction["type"] == "transaction" - # Should still create transaction, just with default name - assert transaction["type"] == "transaction" # Transaction name should be "invoke_agent agent" or similar default assert "invoke_agent" in transaction["transaction"] @pytest.mark.asyncio -async def test_model_response_without_parts(sentry_init, capture_events): +async def test_model_response_without_parts(sentry_init, capture_items): """ Test handling of model response without parts attribute. """ @@ -1723,7 +2690,7 @@ async def test_model_response_without_parts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_input_messages_error_handling(sentry_init, capture_events): +async def test_input_messages_error_handling(sentry_init, capture_items): """ Test that _set_input_messages handles errors gracefully. """ @@ -1751,7 +2718,7 @@ async def test_input_messages_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_available_tools_error_handling(sentry_init, capture_events): +async def test_available_tools_error_handling(sentry_init, capture_items): """ Test that _set_available_tools handles errors gracefully. 
""" @@ -1781,7 +2748,7 @@ async def test_available_tools_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_usage_data_with_none_usage(sentry_init, capture_events): +async def test_set_usage_data_with_none_usage(sentry_init, capture_items): """ Test that _set_usage_data handles None usage gracefully. """ @@ -1806,7 +2773,7 @@ async def test_set_usage_data_with_none_usage(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_usage_data_with_partial_fields(sentry_init, capture_events): +async def test_set_usage_data_with_partial_fields(sentry_init, capture_items): """ Test that _set_usage_data handles usage with only some fields. """ @@ -1837,8 +2804,14 @@ async def test_set_usage_data_with_partial_fields(sentry_init, capture_events): assert transaction is not None +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_message_parts_with_tool_return(sentry_init, capture_events): +async def test_message_parts_with_tool_return( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that ToolReturnPart messages are handled correctly. 
""" @@ -1858,24 +2831,37 @@ def test_tool(x: int) -> int: integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + # Run with history containing tool return + await agent.run("Use test_tool with 5") + + spans = [item.payload for item in items if item.type == "span"] + + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + else: + events = capture_events() - # Run with history containing tool return - await agent.run("Use test_tool with 5") + # Run with history containing tool return + await agent.run("Use test_tool with 5") - (transaction,) = events - spans = transaction["spans"] + (transaction,) = events + spans = transaction["spans"] - chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] + chat_spans = [s for s in spans if s["op"] == "gen_ai.chat"] # Should have chat spans assert len(chat_spans) >= 1 @pytest.mark.asyncio -async def test_message_parts_with_list_content(sentry_init, capture_events): +async def test_message_parts_with_list_content(sentry_init, capture_items): """ Test that message parts with list content are handled correctly. """ @@ -1910,7 +2896,7 @@ async def test_message_parts_with_list_content(sentry_init, capture_events): @pytest.mark.asyncio -async def test_output_data_with_text_and_tool_calls(sentry_init, capture_events): +async def test_output_data_with_text_and_tool_calls(sentry_init, capture_items): """ Test that _set_output_data handles both text and tool calls in response. 
""" @@ -1949,7 +2935,7 @@ async def test_output_data_with_text_and_tool_calls(sentry_init, capture_events) @pytest.mark.asyncio -async def test_output_data_error_handling(sentry_init, capture_events): +async def test_output_data_error_handling(sentry_init, capture_items): """ Test that _set_output_data handles errors in formatting gracefully. """ @@ -1981,7 +2967,7 @@ async def test_output_data_error_handling(sentry_init, capture_events): @pytest.mark.asyncio -async def test_message_with_system_prompt_part(sentry_init, capture_events): +async def test_message_with_system_prompt_part(sentry_init, capture_items): """ Test that SystemPromptPart is handled with correct role. """ @@ -2017,7 +3003,7 @@ async def test_message_with_system_prompt_part(sentry_init, capture_events): @pytest.mark.asyncio -async def test_message_with_instructions(sentry_init, capture_events): +async def test_message_with_instructions(sentry_init, capture_items): """ Test that messages with instructions field are handled correctly. """ @@ -2052,7 +3038,7 @@ async def test_message_with_instructions(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_input_messages_without_prompts(sentry_init, capture_events): +async def test_set_input_messages_without_prompts(sentry_init, capture_items): """ Test that _set_input_messages respects _should_send_prompts(). """ @@ -2078,7 +3064,7 @@ async def test_set_input_messages_without_prompts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_output_data_without_prompts(sentry_init, capture_events): +async def test_set_output_data_without_prompts(sentry_init, capture_items): """ Test that _set_output_data respects _should_send_prompts(). 
""" @@ -2107,7 +3093,7 @@ async def test_set_output_data_without_prompts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_get_model_name_with_exception_in_callable(sentry_init, capture_events): +async def test_get_model_name_with_exception_in_callable(sentry_init, capture_items): """ Test that _get_model_name handles exceptions in name() callable. """ @@ -2131,7 +3117,7 @@ async def test_get_model_name_with_exception_in_callable(sentry_init, capture_ev @pytest.mark.asyncio -async def test_get_model_name_with_string_model(sentry_init, capture_events): +async def test_get_model_name_with_string_model(sentry_init, capture_items): """ Test that _get_model_name handles string models. """ @@ -2150,7 +3136,7 @@ async def test_get_model_name_with_string_model(sentry_init, capture_events): @pytest.mark.asyncio -async def test_get_model_name_with_none(sentry_init, capture_events): +async def test_get_model_name_with_none(sentry_init, capture_items): """ Test that _get_model_name handles None model. """ @@ -2169,7 +3155,7 @@ async def test_get_model_name_with_none(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_model_data_with_system(sentry_init, capture_events): +async def test_set_model_data_with_system(sentry_init, capture_items): """ Test that _set_model_data captures system from model. """ @@ -2200,7 +3186,7 @@ async def test_set_model_data_with_system(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_model_data_from_agent_scope(sentry_init, capture_events): +async def test_set_model_data_from_agent_scope(sentry_init, capture_items): """ Test that _set_model_data retrieves model from agent in scope when not passed. 
""" @@ -2234,7 +3220,7 @@ async def test_set_model_data_from_agent_scope(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_model_data_with_none_settings_values(sentry_init, capture_events): +async def test_set_model_data_with_none_settings_values(sentry_init, capture_items): """ Test that _set_model_data skips None values in settings. """ @@ -2266,7 +3252,7 @@ async def test_set_model_data_with_none_settings_values(sentry_init, capture_eve @pytest.mark.asyncio -async def test_should_send_prompts_without_pii(sentry_init, capture_events): +async def test_should_send_prompts_without_pii(sentry_init, capture_items): """ Test that _should_send_prompts returns False when PII disabled. """ @@ -2284,7 +3270,7 @@ async def test_should_send_prompts_without_pii(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_agent_data_without_agent(sentry_init, capture_events): +async def test_set_agent_data_without_agent(sentry_init, capture_items): """ Test that _set_agent_data handles None agent gracefully. """ @@ -2309,7 +3295,7 @@ async def test_set_agent_data_without_agent(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_agent_data_from_scope(sentry_init, capture_events): +async def test_set_agent_data_from_scope(sentry_init, capture_items): """ Test that _set_agent_data retrieves agent from scope when not passed. """ @@ -2341,7 +3327,7 @@ async def test_set_agent_data_from_scope(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_agent_data_without_name(sentry_init, capture_events): +async def test_set_agent_data_without_name(sentry_init, capture_items): """ Test that _set_agent_data handles agent without name attribute. 
""" @@ -2371,7 +3357,7 @@ async def test_set_agent_data_without_name(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_available_tools_without_toolset(sentry_init, capture_events): +async def test_set_available_tools_without_toolset(sentry_init, capture_items): """ Test that _set_available_tools handles agent without toolset. """ @@ -2401,7 +3387,7 @@ async def test_set_available_tools_without_toolset(sentry_init, capture_events): @pytest.mark.asyncio -async def test_set_available_tools_with_schema(sentry_init, capture_events): +async def test_set_available_tools_with_schema(sentry_init, capture_items): """ Test that _set_available_tools extracts tool schema correctly. """ @@ -2437,7 +3423,7 @@ async def test_set_available_tools_with_schema(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_creation(sentry_init, capture_events): +async def test_execute_tool_span_creation(sentry_init, capture_items): """ Test direct creation of execute_tool span. """ @@ -2464,7 +3450,7 @@ async def test_execute_tool_span_creation(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_with_mcp_type(sentry_init, capture_events): +async def test_execute_tool_span_with_mcp_type(sentry_init, capture_items): """ Test execute_tool span with MCP tool type. """ @@ -2490,7 +3476,7 @@ async def test_execute_tool_span_with_mcp_type(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_without_prompts(sentry_init, capture_events): +async def test_execute_tool_span_without_prompts(sentry_init, capture_items): """ Test that execute_tool span respects _should_send_prompts(). 
""" @@ -2517,7 +3503,7 @@ async def test_execute_tool_span_without_prompts(sentry_init, capture_events): @pytest.mark.asyncio -async def test_execute_tool_span_with_none_args(sentry_init, capture_events): +async def test_execute_tool_span_with_none_args(sentry_init, capture_items): """ Test execute_tool span with None args. """ @@ -2540,7 +3526,7 @@ async def test_execute_tool_span_with_none_args(sentry_init, capture_events): @pytest.mark.asyncio -async def test_update_execute_tool_span_with_none_span(sentry_init, capture_events): +async def test_update_execute_tool_span_with_none_span(sentry_init, capture_items): """ Test that update_execute_tool_span handles None span gracefully. """ @@ -2561,7 +3547,7 @@ async def test_update_execute_tool_span_with_none_span(sentry_init, capture_even @pytest.mark.asyncio -async def test_update_execute_tool_span_with_none_result(sentry_init, capture_events): +async def test_update_execute_tool_span_with_none_result(sentry_init, capture_items): """ Test that update_execute_tool_span handles None result gracefully. """ @@ -2588,7 +3574,7 @@ async def test_update_execute_tool_span_with_none_result(sentry_init, capture_ev @pytest.mark.asyncio -async def test_tool_execution_without_span_context(sentry_init, capture_events): +async def test_tool_execution_without_span_context(sentry_init, capture_items): """ Test that tool execution patch handles case when no span context exists. This tests the code path where current_span is None in _patch_tool_execution. @@ -2617,7 +3603,7 @@ async def test_tool_execution_without_span_context(sentry_init, capture_events): @pytest.mark.asyncio -async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_events): +async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_items): """ Test that invoke_agent_span skips callable instructions correctly. 
""" @@ -2650,7 +3636,7 @@ async def test_invoke_agent_span_with_callable_instruction(sentry_init, capture_ @pytest.mark.asyncio -async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_events): +async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_items): """ Test that invoke_agent_span handles string instructions (not list). """ @@ -2680,7 +3666,7 @@ async def test_invoke_agent_span_with_string_instructions(sentry_init, capture_e @pytest.mark.asyncio -async def test_ai_client_span_with_streaming_flag(sentry_init, capture_events): +async def test_ai_client_span_with_streaming_flag(sentry_init, capture_items): """ Test that ai_client_span reads streaming flag from scope. """ @@ -2706,7 +3692,7 @@ async def test_ai_client_span_with_streaming_flag(sentry_init, capture_events): @pytest.mark.asyncio -async def test_ai_client_span_gets_agent_from_scope(sentry_init, capture_events): +async def test_ai_client_span_gets_agent_from_scope(sentry_init, capture_items): """ Test that ai_client_span gets agent from scope when not passed. 
""" @@ -2758,63 +3744,116 @@ def _find_binary_content(messages_data, expected_modality, expected_mime_type): return False +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_binary_content_encoding_image(sentry_init, capture_events): +async def test_binary_content_encoding_image( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that BinaryContent with image data is properly encoded in messages.""" sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with sentry_sdk.start_transaction(op="test", name="test"): - span = sentry_sdk.start_span(op="test_span") - binary_content = BinaryContent( - data=b"fake_image_data_12345", media_type="image/png" - ) - user_part = UserPromptPart(content=["Look at this image:", binary_content]) - mock_msg = MagicMock() - mock_msg.parts = [user_part] - mock_msg.instructions = None + with sentry_sdk.start_transaction(op="test", name="test"): + span = sentry_sdk.start_span(op="test_span") + binary_content = BinaryContent( + data=b"fake_image_data_12345", media_type="image/png" + ) + user_part = UserPromptPart(content=["Look at this image:", binary_content]) + mock_msg = MagicMock() + mock_msg.parts = [user_part] + mock_msg.instructions = None - _set_input_messages(span, [mock_msg]) - span.finish() + _set_input_messages(span, [mock_msg]) + span.finish() + + (event,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with sentry_sdk.start_transaction(op="test", name="test"): + span = sentry_sdk.start_span(op="test_span") + binary_content = BinaryContent( + data=b"fake_image_data_12345", media_type="image/png" + ) + user_part = UserPromptPart(content=["Look at this image:", binary_content]) + mock_msg = MagicMock() 
+ mock_msg.parts = [user_part] + mock_msg.instructions = None + + _set_input_messages(span, [mock_msg]) + span.finish() - (event,) = events + (event,) = events span_data = event["spans"][0]["data"] messages_data = _get_messages_from_span(span_data) assert _find_binary_content(messages_data, "image", "image/png") +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_binary_content_encoding_mixed_content(sentry_init, capture_events): +async def test_binary_content_encoding_mixed_content( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that BinaryContent mixed with text content is properly handled.""" sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with sentry_sdk.start_transaction(op="test", name="test"): - span = sentry_sdk.start_span(op="test_span") - binary_content = BinaryContent( - data=b"fake_image_bytes", media_type="image/jpeg" - ) - user_part = UserPromptPart( - content=["Here is an image:", binary_content, "What do you see?"] - ) - mock_msg = MagicMock() - mock_msg.parts = [user_part] - mock_msg.instructions = None + with sentry_sdk.start_transaction(op="test", name="test"): + span = sentry_sdk.start_span(op="test_span") + binary_content = BinaryContent( + data=b"fake_image_bytes", media_type="image/jpeg" + ) + user_part = UserPromptPart( + content=["Here is an image:", binary_content, "What do you see?"] + ) + mock_msg = MagicMock() + mock_msg.parts = [user_part] + mock_msg.instructions = None - _set_input_messages(span, [mock_msg]) - span.finish() + _set_input_messages(span, [mock_msg]) + span.finish() + + (event,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with sentry_sdk.start_transaction(op="test", name="test"): 
+ span = sentry_sdk.start_span(op="test_span") + binary_content = BinaryContent( + data=b"fake_image_bytes", media_type="image/jpeg" + ) + user_part = UserPromptPart( + content=["Here is an image:", binary_content, "What do you see?"] + ) + mock_msg = MagicMock() + mock_msg.parts = [user_part] + mock_msg.instructions = None + + _set_input_messages(span, [mock_msg]) + span.finish() + + (event,) = events - (event,) = events span_data = event["spans"][0]["data"] messages_data = _get_messages_from_span(span_data) @@ -2829,8 +3868,14 @@ async def test_binary_content_encoding_mixed_content(sentry_init, capture_events assert _find_binary_content(messages_data, "image", "image/jpeg") +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_binary_content_in_agent_run(sentry_init, capture_events): +async def test_binary_content_in_agent_run( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that BinaryContent in actual agent run is properly captured in spans.""" agent = Agent("test", name="test_binary_agent") @@ -2838,48 +3883,100 @@ async def test_binary_content_in_agent_run(sentry_init, capture_events): integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() binary_content = BinaryContent( data=b"fake_image_data_for_testing", media_type="image/png" ) - await agent.run(["Analyze this image:", binary_content]) - (transaction,) = events - chat_spans = [s for s in transaction["spans"] if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - chat_span = chat_spans[0] - if "gen_ai.request.messages" in chat_span["data"]: - messages_str = str(chat_span["data"]["gen_ai.request.messages"]) - assert any(keyword in messages_str for keyword in ["blob", "image", "base64"]) + await agent.run(["Analyze this image:", binary_content]) 
+ + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + if "gen_ai.request.messages" in chat_span["attributes"]: + messages_str = str(chat_span["attributes"]["gen_ai.request.messages"]) + + assert any( + keyword in messages_str for keyword in ["blob", "image", "base64"] + ) + else: + events = capture_events() + await agent.run(["Analyze this image:", binary_content]) + (transaction,) = events + chat_spans = [s for s in transaction["spans"] if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + + chat_span = chat_spans[0] + if "gen_ai.request.messages" in chat_span["data"]: + messages_str = str(chat_span["data"]["gen_ai.request.messages"]) + assert any( + keyword in messages_str for keyword in ["blob", "image", "base64"] + ) + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_set_usage_data_with_cache_tokens(sentry_init, capture_events): +async def test_set_usage_data_with_cache_tokens( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """Test that cache_read_tokens and cache_write_tokens are tracked.""" - sentry_init(integrations=[PydanticAIIntegration()], traces_sample_rate=1.0) + sentry_init( + integrations=[PydanticAIIntegration()], + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - with sentry_sdk.start_transaction(op="test", name="test"): - span = sentry_sdk.start_span(op="test_span") - usage = RequestUsage( - input_tokens=100, - output_tokens=50, - cache_read_tokens=80, - cache_write_tokens=20, - ) - _set_usage_data(span, usage) - span.finish() + with sentry_sdk.start_transaction(op="test", name="test"): + span = sentry_sdk.start_span(op="test_span") + usage = RequestUsage( + 
input_tokens=100, + output_tokens=50, + cache_read_tokens=80, + cache_write_tokens=20, + ) + _set_usage_data(span, usage) + span.finish() + + (event,) = (item.payload for item in items if item.type == "transaction") + else: + events = capture_events() + + with sentry_sdk.start_transaction(op="test", name="test"): + span = sentry_sdk.start_span(op="test_span") + usage = RequestUsage( + input_tokens=100, + output_tokens=50, + cache_read_tokens=80, + cache_write_tokens=20, + ) + _set_usage_data(span, usage) + span.finish() + + (event,) = events - (event,) = events (span_data,) = event["spans"] assert span_data["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80 assert span_data["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20 +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.parametrize( "url,image_url_kwargs,expected_content", [ @@ -2922,7 +4019,13 @@ async def test_set_usage_data_with_cache_tokens(sentry_init, capture_events): ], ) def test_image_url_base64_content_in_span( - sentry_init, capture_events, url, image_url_kwargs, expected_content + sentry_init, + capture_events, + capture_items, + url, + image_url_kwargs, + expected_content, + stream_gen_ai_spans, ): from sentry_sdk.integrations.pydantic_ai.spans.ai_client import ai_client_span @@ -2930,37 +4033,67 @@ def test_image_url_base64_content_in_span( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + found_image = False + if stream_gen_ai_spans: + items = capture_items("transaction", "span") + + with sentry_sdk.start_transaction(op="test", name="test"): + image_url = ImageUrl(url=url, **image_url_kwargs) + user_part = UserPromptPart(content=["Look at this image:", image_url]) + mock_msg = MagicMock() + mock_msg.parts = [user_part] + mock_msg.instructions = None + + span = ai_client_span([mock_msg], None, None, None) + span.finish() + + spans = 
[item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + messages_data = _get_messages_from_span(chat_spans[0]["attributes"]) - with sentry_sdk.start_transaction(op="test", name="test"): - image_url = ImageUrl(url=url, **image_url_kwargs) - user_part = UserPromptPart(content=["Look at this image:", image_url]) - mock_msg = MagicMock() - mock_msg.parts = [user_part] - mock_msg.instructions = None + for msg in messages_data: + if "content" not in msg: + continue + for content_item in msg["content"]: + if content_item.get("type") == "image": + found_image = True + assert content_item["content"] == expected_content + else: + events = capture_events() - span = ai_client_span([mock_msg], None, None, None) - span.finish() + with sentry_sdk.start_transaction(op="test", name="test"): + image_url = ImageUrl(url=url, **image_url_kwargs) + user_part = UserPromptPart(content=["Look at this image:", image_url]) + mock_msg = MagicMock() + mock_msg.parts = [user_part] + mock_msg.instructions = None - (event,) = events - chat_spans = [s for s in event["spans"] if s["op"] == "gen_ai.chat"] - assert len(chat_spans) >= 1 - messages_data = _get_messages_from_span(chat_spans[0]["data"]) + span = ai_client_span([mock_msg], None, None, None) + span.finish() - found_image = False - for msg in messages_data: - if "content" not in msg: - continue - for content_item in msg["content"]: - if content_item.get("type") == "image": - found_image = True - assert content_item["content"] == expected_content + (event,) = events + chat_spans = [s for s in event["spans"] if s["op"] == "gen_ai.chat"] + assert len(chat_spans) >= 1 + messages_data = _get_messages_from_span(chat_spans[0]["data"]) + + for msg in messages_data: + if "content" not in msg: + continue + for content_item in msg["content"]: + if content_item.get("type") == "image": + found_image = True + assert 
content_item["content"] == expected_content assert found_image, "Image content item should be found in messages data" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio @pytest.mark.parametrize( "url, image_url_kwargs, expected_content", @@ -2992,40 +4125,76 @@ def test_image_url_base64_content_in_span( ], ) async def test_invoke_agent_image_url( - sentry_init, capture_events, url, image_url_kwargs, expected_content + sentry_init, + capture_events, + capture_items, + url, + image_url_kwargs, + expected_content, + stream_gen_ai_spans, ): sentry_init( integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) agent = Agent("test", name="test_image_url_agent") - events = capture_events() image_url = ImageUrl(url=url, **image_url_kwargs) - await agent.run([image_url, "Describe this image"]) - (transaction,) = events + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - found_image = False + await agent.run([image_url, "Describe this image"]) - chat_spans = [s for s in transaction["spans"] if s["op"] == "gen_ai.chat"] - for chat_span in chat_spans: - messages_data = _get_messages_from_span(chat_span["data"]) - for msg in messages_data: - if "content" not in msg: - continue - for content_item in msg["content"]: - if content_item.get("type") == "image": - assert content_item["content"] == expected_content - found_image = True + found_image = False + + spans = [item.payload for item in items if item.type == "span"] + chat_spans = [ + s for s in spans if s["attributes"].get("sentry.op", "") == "gen_ai.chat" + ] + for chat_span in chat_spans: + messages_data = _get_messages_from_span(chat_span["attributes"]) + for msg in messages_data: + if "content" not in msg: + continue + for content_item in msg["content"]: + if content_item.get("type") == "image": + assert content_item["content"] == expected_content + found_image = True + else: + events = 
capture_events() + + await agent.run([image_url, "Describe this image"]) + + (transaction,) = events + + found_image = False + + chat_spans = [s for s in transaction["spans"] if s["op"] == "gen_ai.chat"] + for chat_span in chat_spans: + messages_data = _get_messages_from_span(chat_span["data"]) + for msg in messages_data: + if "content" not in msg: + continue + for content_item in msg["content"]: + if content_item.get("type") == "image": + assert content_item["content"] == expected_content + found_image = True assert found_image, "Image content item should be found in messages data" +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) @pytest.mark.asyncio -async def test_tool_description_in_execute_tool_span(sentry_init, capture_events): +async def test_tool_description_in_execute_tool_span( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): """ Test that tool description from the tool's docstring is included in execute_tool spans. """ @@ -3044,20 +4213,51 @@ def multiply_numbers(a: int, b: int) -> int: integrations=[PydanticAIIntegration()], traces_sample_rate=1.0, send_default_pii=True, + stream_gen_ai_spans=stream_gen_ai_spans, ) - events = capture_events() + if stream_gen_ai_spans: + items = capture_items("transaction", "span") - result = await agent.run("What is 5 times 3?") - assert result is not None + result = await agent.run("What is 5 times 3?") + assert result is not None + + spans = [item.payload for item in items if item.type == "span"] + + tool_spans = [ + s + for s in spans + if s["attributes"].get("sentry.op", "") == "gen_ai.execute_tool" + ] + + assert len(tool_spans) >= 1 + + tool_span = tool_spans[0] + + assert tool_span["attributes"]["gen_ai.tool.name"] == "multiply_numbers" + assert SPANDATA.GEN_AI_TOOL_DESCRIPTION in tool_span["attributes"] + assert ( + "Multiply two numbers" + in tool_span["attributes"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + ) + else: + events = capture_events() + + result = await 
agent.run("What is 5 times 3?") + assert result is not None + + (transaction,) = events + spans = transaction["spans"] + + tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] - (transaction,) = events - spans = transaction["spans"] + assert len(tool_spans) >= 1 - tool_spans = [s for s in spans if s["op"] == "gen_ai.execute_tool"] - assert len(tool_spans) >= 1 + tool_span = tool_spans[0] - tool_span = tool_spans[0] - assert tool_span["data"]["gen_ai.tool.name"] == "multiply_numbers" - assert SPANDATA.GEN_AI_TOOL_DESCRIPTION in tool_span["data"] - assert "Multiply two numbers" in tool_span["data"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + assert tool_span["data"]["gen_ai.tool.name"] == "multiply_numbers" + assert SPANDATA.GEN_AI_TOOL_DESCRIPTION in tool_span["data"] + assert ( + "Multiply two numbers" + in tool_span["data"][SPANDATA.GEN_AI_TOOL_DESCRIPTION] + ) diff --git a/tests/tracing/test_decorator.py b/tests/tracing/test_decorator.py index 15432f5862..4e0c6cc1a9 100644 --- a/tests/tracing/test_decorator.py +++ b/tests/tracing/test_decorator.py @@ -121,10 +121,13 @@ async def _some_function_traced(a, b, c): ) -def test_span_templates_ai_dicts(sentry_init, capture_events): - sentry_init(traces_sample_rate=1.0) - events = capture_events() - +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_templates_ai_dicts( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): @sentry_sdk.trace(template=SPANTEMPLATE.AI_TOOL) def my_tool(arg1, arg2): return { @@ -163,65 +166,157 @@ def my_agent(): presence_penalty=2.0, ) - with sentry_sdk.start_transaction(name="test-transaction"): - my_agent() + if stream_gen_ai_spans: + sentry_init( + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + items = capture_items("span") - (event,) = events - (agent_span, tool_span, chat_span) = event["spans"] + with sentry_sdk.start_transaction(name="test-transaction"): + my_agent() - assert agent_span["op"] == 
"gen_ai.invoke_agent" - assert ( - agent_span["description"] - == "invoke_agent test_decorator.test_span_templates_ai_dicts..my_agent" - ) - assert agent_span["data"] == { - "gen_ai.agent.name": "test_decorator.test_span_templates_ai_dicts..my_agent", - "gen_ai.operation.name": "invoke_agent", - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - assert tool_span["op"] == "gen_ai.execute_tool" - assert ( - tool_span["description"] - == "execute_tool test_decorator.test_span_templates_ai_dicts..my_tool" - ) - assert tool_span["data"] == { - "gen_ai.tool.name": "test_decorator.test_span_templates_ai_dicts..my_tool", - "gen_ai.operation.name": "execute_tool", - "gen_ai.usage.input_tokens": 10, - "gen_ai.usage.output_tokens": 20, - "gen_ai.usage.total_tokens": 30, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - assert "gen_ai.tool.description" not in tool_span["data"] - - assert chat_span["op"] == "gen_ai.chat" - assert chat_span["description"] == "chat my-gpt-4o-mini" - assert chat_span["data"] == { - "gen_ai.operation.name": "chat", - "gen_ai.request.frequency_penalty": 1.0, - "gen_ai.request.max_tokens": 100, - "gen_ai.request.messages": "[{'role': 'user', 'content': 'What is the weather in Tokyo?'}, {'role': 'system', 'content': 'You are a helpful assistant that can answer questions about the weather.'}]", - "gen_ai.request.model": "my-gpt-4o-mini", - "gen_ai.request.presence_penalty": 2.0, - "gen_ai.request.temperature": 0.5, - "gen_ai.request.top_k": 40, - "gen_ai.request.top_p": 0.9, - "gen_ai.response.model": "my-gpt-4o-mini-v123", - "gen_ai.usage.input_tokens": 11, - "gen_ai.usage.output_tokens": 22, - "gen_ai.usage.total_tokens": 33, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - -def test_span_templates_ai_objects(sentry_init, capture_events): - sentry_init(traces_sample_rate=1.0) - events = capture_events() + (agent_span, tool_span, chat_span) = ( + item.payload for item in items if item.type == "span" + ) + + assert ( + 
agent_span["name"] + == "invoke_agent test_decorator.test_span_templates_ai_dicts..my_agent" + ) + assert agent_span["attributes"] == { + "gen_ai.agent.name": "test_decorator.test_span_templates_ai_dicts..my_agent", + "gen_ai.operation.name": "invoke_agent", + "sentry.environment": "production", + "sentry.op": "gen_ai.invoke_agent", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert ( + tool_span["name"] + == "execute_tool test_decorator.test_span_templates_ai_dicts..my_tool" + ) + assert tool_span["attributes"] == { + "gen_ai.tool.name": "test_decorator.test_span_templates_ai_dicts..my_tool", + "gen_ai.operation.name": "execute_tool", + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 20, + "gen_ai.usage.total_tokens": 30, + "sentry.environment": "production", + "sentry.op": "gen_ai.execute_tool", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + assert "gen_ai.tool.description" not in tool_span["attributes"] + + assert chat_span["name"] == "chat my-gpt-4o-mini" + assert chat_span["attributes"] == { + "gen_ai.operation.name": "chat", + "gen_ai.request.frequency_penalty": 1.0, + "gen_ai.request.max_tokens": 100, + "gen_ai.request.messages": "[{'role': 'user', 'content': 'What is the weather in Tokyo?'}, {'role': 'system', 'content': 'You are a helpful assistant that can answer questions about the weather.'}]", + "gen_ai.request.model": "my-gpt-4o-mini", + "gen_ai.request.presence_penalty": 2.0, + "gen_ai.request.temperature": 0.5, + "gen_ai.request.top_k": 40, + "gen_ai.request.top_p": 0.9, + 
"gen_ai.response.model": "my-gpt-4o-mini-v123", + "gen_ai.usage.input_tokens": 11, + "gen_ai.usage.output_tokens": 22, + "gen_ai.usage.total_tokens": 33, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + else: + sentry_init( + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) + events = capture_events() + + with sentry_sdk.start_transaction(name="test-transaction"): + my_agent() + + (event,) = events + (agent_span, tool_span, chat_span) = event["spans"] + assert agent_span["op"] == "gen_ai.invoke_agent" + assert ( + agent_span["description"] + == "invoke_agent test_decorator.test_span_templates_ai_dicts..my_agent" + ) + assert agent_span["data"] == { + "gen_ai.agent.name": "test_decorator.test_span_templates_ai_dicts..my_agent", + "gen_ai.operation.name": "invoke_agent", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert tool_span["op"] == "gen_ai.execute_tool" + assert ( + tool_span["description"] + == "execute_tool test_decorator.test_span_templates_ai_dicts..my_tool" + ) + assert tool_span["data"] == { + "gen_ai.tool.name": "test_decorator.test_span_templates_ai_dicts..my_tool", + "gen_ai.operation.name": "execute_tool", + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 20, + "gen_ai.usage.total_tokens": 30, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + assert "gen_ai.tool.description" not in tool_span["data"] + + assert chat_span["op"] == "gen_ai.chat" + assert chat_span["description"] == "chat my-gpt-4o-mini" + assert chat_span["data"] == { + "gen_ai.operation.name": "chat", + "gen_ai.request.frequency_penalty": 1.0, + "gen_ai.request.max_tokens": 100, + "gen_ai.request.messages": "[{'role': 'user', 'content': 'What 
is the weather in Tokyo?'}, {'role': 'system', 'content': 'You are a helpful assistant that can answer questions about the weather.'}]", + "gen_ai.request.model": "my-gpt-4o-mini", + "gen_ai.request.presence_penalty": 2.0, + "gen_ai.request.temperature": 0.5, + "gen_ai.request.top_k": 40, + "gen_ai.request.top_p": 0.9, + "gen_ai.response.model": "my-gpt-4o-mini-v123", + "gen_ai.usage.input_tokens": 11, + "gen_ai.usage.output_tokens": 22, + "gen_ai.usage.total_tokens": 33, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +def test_span_templates_ai_objects( + sentry_init, + capture_events, + capture_items, + stream_gen_ai_spans, +): @sentry_sdk.trace(template=SPANTEMPLATE.AI_TOOL) def my_tool(arg1, arg2): """This is a tool function.""" @@ -264,66 +359,156 @@ def my_agent(): presence_penalty=2.0, ) - with sentry_sdk.start_transaction(name="test-transaction"): - my_agent() + sentry_init( + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - (event,) = events - (agent_span, tool_span, chat_span) = event["spans"] + if stream_gen_ai_spans: + items = capture_items("span") - assert agent_span["op"] == "gen_ai.invoke_agent" - assert ( - agent_span["description"] - == "invoke_agent test_decorator.test_span_templates_ai_objects..my_agent" - ) - assert agent_span["data"] == { - "gen_ai.agent.name": "test_decorator.test_span_templates_ai_objects..my_agent", - "gen_ai.operation.name": "invoke_agent", - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - assert tool_span["op"] == "gen_ai.execute_tool" - assert ( - tool_span["description"] - == "execute_tool test_decorator.test_span_templates_ai_objects..my_tool" - ) - assert tool_span["data"] == { - "gen_ai.tool.name": "test_decorator.test_span_templates_ai_objects..my_tool", - "gen_ai.tool.description": "This is a tool function.", - "gen_ai.operation.name": "execute_tool", - "gen_ai.usage.input_tokens": 10, - 
"gen_ai.usage.output_tokens": 20, - "gen_ai.usage.total_tokens": 30, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } - - assert chat_span["op"] == "gen_ai.chat" - assert chat_span["description"] == "chat my-gpt-4o-mini" - assert chat_span["data"] == { - "gen_ai.operation.name": "chat", - "gen_ai.request.frequency_penalty": 1.0, - "gen_ai.request.max_tokens": 100, - "gen_ai.request.messages": "[{'role': 'user', 'content': 'What is the weather in Tokyo?'}, {'role': 'system', 'content': 'You are a helpful assistant that can answer questions about the weather.'}]", - "gen_ai.request.model": "my-gpt-4o-mini", - "gen_ai.request.presence_penalty": 2.0, - "gen_ai.request.temperature": 0.5, - "gen_ai.request.top_k": 40, - "gen_ai.request.top_p": 0.9, - "gen_ai.response.model": "my-gpt-4o-mini-v123", - "gen_ai.usage.input_tokens": 11, - "gen_ai.usage.output_tokens": 22, - "gen_ai.usage.total_tokens": 33, - "thread.id": mock.ANY, - "thread.name": mock.ANY, - } + with sentry_sdk.start_transaction(name="test-transaction"): + my_agent() + (agent_span, tool_span, chat_span) = ( + item.payload for item in items if item.type == "span" + ) -@pytest.mark.parametrize("send_default_pii", [True, False]) -def test_span_templates_ai_pii(sentry_init, capture_events, send_default_pii): - sentry_init(traces_sample_rate=1.0, send_default_pii=send_default_pii) - events = capture_events() + assert ( + agent_span["name"] + == "invoke_agent test_decorator.test_span_templates_ai_objects..my_agent" + ) + assert agent_span["attributes"] == { + "gen_ai.agent.name": "test_decorator.test_span_templates_ai_objects..my_agent", + "gen_ai.operation.name": "invoke_agent", + "sentry.environment": "production", + "sentry.op": "gen_ai.invoke_agent", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", + "thread.id": mock.ANY, + "thread.name": 
mock.ANY, + } + + assert ( + tool_span["name"] + == "execute_tool test_decorator.test_span_templates_ai_objects..my_tool" + ) + assert tool_span["attributes"] == { + "gen_ai.tool.name": "test_decorator.test_span_templates_ai_objects..my_tool", + "gen_ai.tool.description": "This is a tool function.", + "gen_ai.operation.name": "execute_tool", + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 20, + "gen_ai.usage.total_tokens": 30, + "sentry.environment": "production", + "sentry.op": "gen_ai.execute_tool", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert chat_span["name"] == "chat my-gpt-4o-mini" + assert chat_span["attributes"] == { + "gen_ai.operation.name": "chat", + "gen_ai.request.frequency_penalty": 1.0, + "gen_ai.request.max_tokens": 100, + "gen_ai.request.messages": "[{'role': 'user', 'content': 'What is the weather in Tokyo?'}, {'role': 'system', 'content': 'You are a helpful assistant that can answer questions about the weather.'}]", + "gen_ai.request.model": "my-gpt-4o-mini", + "gen_ai.request.presence_penalty": 2.0, + "gen_ai.request.temperature": 0.5, + "gen_ai.request.top_k": 40, + "gen_ai.request.top_p": 0.9, + "gen_ai.response.model": "my-gpt-4o-mini-v123", + "gen_ai.usage.input_tokens": 11, + "gen_ai.usage.output_tokens": 22, + "gen_ai.usage.total_tokens": 33, + "sentry.environment": "production", + "sentry.op": "gen_ai.chat", + "sentry.origin": "manual", + "sentry.release": mock.ANY, + "sentry.sdk.name": "sentry.python", + "sentry.sdk.version": mock.ANY, + "sentry.segment.id": mock.ANY, + "sentry.segment.name": "test-transaction", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + else: + events = capture_events() + with sentry_sdk.start_transaction(name="test-transaction"): + my_agent() + + (event,) = 
events + (agent_span, tool_span, chat_span) = event["spans"] + + assert agent_span["op"] == "gen_ai.invoke_agent" + assert ( + agent_span["description"] + == "invoke_agent test_decorator.test_span_templates_ai_objects..my_agent" + ) + assert agent_span["data"] == { + "gen_ai.agent.name": "test_decorator.test_span_templates_ai_objects..my_agent", + "gen_ai.operation.name": "invoke_agent", + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert tool_span["op"] == "gen_ai.execute_tool" + assert ( + tool_span["description"] + == "execute_tool test_decorator.test_span_templates_ai_objects..my_tool" + ) + assert tool_span["data"] == { + "gen_ai.tool.name": "test_decorator.test_span_templates_ai_objects..my_tool", + "gen_ai.tool.description": "This is a tool function.", + "gen_ai.operation.name": "execute_tool", + "gen_ai.usage.input_tokens": 10, + "gen_ai.usage.output_tokens": 20, + "gen_ai.usage.total_tokens": 30, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + assert chat_span["op"] == "gen_ai.chat" + assert chat_span["description"] == "chat my-gpt-4o-mini" + assert chat_span["data"] == { + "gen_ai.operation.name": "chat", + "gen_ai.request.frequency_penalty": 1.0, + "gen_ai.request.max_tokens": 100, + "gen_ai.request.messages": "[{'role': 'user', 'content': 'What is the weather in Tokyo?'}, {'role': 'system', 'content': 'You are a helpful assistant that can answer questions about the weather.'}]", + "gen_ai.request.model": "my-gpt-4o-mini", + "gen_ai.request.presence_penalty": 2.0, + "gen_ai.request.temperature": 0.5, + "gen_ai.request.top_k": 40, + "gen_ai.request.top_p": 0.9, + "gen_ai.response.model": "my-gpt-4o-mini-v123", + "gen_ai.usage.input_tokens": 11, + "gen_ai.usage.output_tokens": 22, + "gen_ai.usage.total_tokens": 33, + "thread.id": mock.ANY, + "thread.name": mock.ANY, + } + + +@pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) +@pytest.mark.parametrize("send_default_pii", [True, False]) +def test_span_templates_ai_pii( 
+ sentry_init, + capture_events, + capture_items, + send_default_pii, + stream_gen_ai_spans, +): @sentry_sdk.trace(template=SPANTEMPLATE.AI_TOOL) def my_tool(arg1, arg2, **kwargs): """This is a tool function.""" @@ -349,18 +534,44 @@ def my_agent(*args, **kwargs): ) return "agent_output" - with sentry_sdk.start_transaction(name="test-transaction"): - my_agent(22, 33, arg1=44, arg2=55) + sentry_init( + traces_sample_rate=1.0, + send_default_pii=send_default_pii, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - (event,) = events - (_, tool_span, _) = event["spans"] + if stream_gen_ai_spans: + items = capture_items("span") - if send_default_pii: - assert ( - tool_span["data"]["gen_ai.tool.input"] - == "{'args': (1, 2), 'kwargs': {'tool_arg1': '3', 'tool_arg2': '4'}}" - ) - assert tool_span["data"]["gen_ai.tool.output"] == "'tool_output'" + with sentry_sdk.start_transaction(name="test-transaction"): + my_agent(22, 33, arg1=44, arg2=55) + + (_, tool_span, _) = (item.payload for item in items if item.type == "span") + + if send_default_pii: + assert ( + tool_span["attributes"]["gen_ai.tool.input"] + == "{'args': (1, 2), 'kwargs': {'tool_arg1': '3', 'tool_arg2': '4'}}" + ) + assert tool_span["attributes"]["gen_ai.tool.output"] == "'tool_output'" + else: + assert "gen_ai.tool.input" not in tool_span["attributes"] + assert "gen_ai.tool.output" not in tool_span["attributes"] else: - assert "gen_ai.tool.input" not in tool_span["data"] - assert "gen_ai.tool.output" not in tool_span["data"] + events = capture_events() + + with sentry_sdk.start_transaction(name="test-transaction"): + my_agent(22, 33, arg1=44, arg2=55) + + (event,) = events + (_, tool_span, _) = event["spans"] + + if send_default_pii: + assert ( + tool_span["data"]["gen_ai.tool.input"] + == "{'args': (1, 2), 'kwargs': {'tool_arg1': '3', 'tool_arg2': '4'}}" + ) + assert tool_span["data"]["gen_ai.tool.output"] == "'tool_output'" + else: + assert "gen_ai.tool.input" not in tool_span["data"] + assert 
"gen_ai.tool.output" not in tool_span["data"] diff --git a/tests/tracing/test_misc.py b/tests/tracing/test_misc.py index c6fd6df84e..1066bcb709 100644 --- a/tests/tracing/test_misc.py +++ b/tests/tracing/test_misc.py @@ -646,22 +646,41 @@ def test_conversation_id_propagates_to_span_with_ai_op( span_data = event["spans"][0]["data"] assert span_data.get("gen_ai.conversation.id") == "conv-ai-op-test" + @pytest.mark.parametrize("stream_gen_ai_spans", [True, False]) def test_conversation_id_propagates_to_span_with_gen_ai_op( - self, sentry_init, capture_events + self, sentry_init, capture_events, capture_items, stream_gen_ai_spans ): """Span with gen_ai.* op should get conversation_id.""" - sentry_init(traces_sample_rate=1.0) - events = capture_events() + sentry_init( + traces_sample_rate=1.0, + stream_gen_ai_spans=stream_gen_ai_spans, + ) - scope = sentry_sdk.get_current_scope() - scope.set_conversation_id("conv-gen-ai-op-test") + if stream_gen_ai_spans: + items = capture_items("span") - with sentry_sdk.start_transaction(name="test-tx"): - with start_span(op="gen_ai.invoke_agent"): - pass + scope = sentry_sdk.get_current_scope() + scope.set_conversation_id("conv-gen-ai-op-test") + + with sentry_sdk.start_transaction(name="test-tx"): + with start_span(op="gen_ai.invoke_agent"): + pass + + spans = [item.payload for item in items if item.type == "span"] + span_data = spans[0]["attributes"] + else: + events = capture_events() + + scope = sentry_sdk.get_current_scope() + scope.set_conversation_id("conv-gen-ai-op-test") + + with sentry_sdk.start_transaction(name="test-tx"): + with start_span(op="gen_ai.invoke_agent"): + pass + + (event,) = events + span_data = event["spans"][0]["data"] - (event,) = events - span_data = event["spans"][0]["data"] assert span_data.get("gen_ai.conversation.id") == "conv-gen-ai-op-test" def test_conversation_id_not_propagated_to_non_ai_span(