From 1190c9f7dbc1b6d5912070891cf0e1633a027831 Mon Sep 17 00:00:00 2001 From: adityamehra Date: Fri, 13 Feb 2026 17:02:11 -0800 Subject: [PATCH] fix(langchain): use default EmbeddingInvocation operation_name instead of hardcoded value The langchain instrumentation was setting operation_name="embedding" (singular) which overrode the correct semantic convention default of "embeddings" (plural) defined in EmbeddingInvocation types.py. Remove the explicit override so the dataclass default from GenAiOperationNameValues.EMBEDDINGS is used. Add embedding tests for both packages: - util-genai: default operation_name, semconv attributes, span name, error path - langchain: VCR integration tests for success and error paths Co-authored-by: Cursor --- .../instrumentation/langchain/__init__.py | 1 - .../test_langchain_embedding_call.yaml | 78 +++++++++ .../test_langchain_embedding_call_error.yaml | 71 ++++++++ .../tests/test_langchain_embedding.py | 154 ++++++++++++++++++ .../tests/test_embedding_invocation.py | 149 ++++++++++++++++- 5 files changed, 451 insertions(+), 2 deletions(-) create mode 100644 instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call.yaml create mode 100644 instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call_error.yaml create mode 100644 instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_langchain_embedding.py diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/src/opentelemetry/instrumentation/langchain/__init__.py b/instrumentation-genai/opentelemetry-instrumentation-langchain/src/opentelemetry/instrumentation/langchain/__init__.py index b457dc6b..688361c8 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-langchain/src/opentelemetry/instrumentation/langchain/__init__.py +++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/src/opentelemetry/instrumentation/langchain/__init__.py @@ -204,7 +204,6 @@ def _start_embedding(instance, texts): # Create embedding invocation embedding = UtilEmbeddingInvocation( - operation_name="embedding", request_model=request_model, input_texts=texts if isinstance(texts, list) else [texts], provider=provider, diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call.yaml b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call.yaml new file mode 100644 index 00000000..0c768091 --- /dev/null +++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call.yaml @@ -0,0 +1,78 @@ +interactions: +- request: + body: |- + { + "input": [ + "What is the capital of France?" + ], + "model": "text-embedding-ada-002", + "encoding_format": "float" + } + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate, zstd + authorization: + - Bearer test_openai_api_key + connection: + - keep-alive + content-type: + - application/json + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.108.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.108.1 + x-stainless-raw-response: + - 'true' + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.10 + method: POST + uri: https://api.openai.com/v1/embeddings + response: + body: + string: |- + { + "object": "list", + "data": [ + { + "object": "embedding", + "embedding": [0.0023064255, -0.009327292, 0.015797347, -0.0077586975, -0.013595423], + "index": 0 + } + ], + "model": "text-embedding-ada-002-v2", + "usage": { + "prompt_tokens": 7, + "total_tokens": 7 + } + } + headers: + content-type: + - application/json + date: + - Sun, 21 Sep 2025 04:09:41 GMT + openai-organization: + - test_openai_org_id + x-request-id: + - emb-50308e7e-2aac-4167-a8fb-03f9f5ed8169 + content-length: + - '256' + status: + code: 200 + message: OK +version: 1 diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call_error.yaml b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call_error.yaml new file mode 100644 index 00000000..50cfcee5 --- /dev/null +++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/cassettes/test_langchain_embedding_call_error.yaml @@ -0,0 +1,71 @@ +interactions: +- request: + body: |- + { + "input": [ + "What is the capital of France?" + ], + "model": "text-embedding-ada-002", + "encoding_format": "float" + } + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate, zstd + authorization: + - Bearer test_openai_api_key + connection: + - keep-alive + content-type: + - application/json + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.108.1 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.108.1 + x-stainless-raw-response: + - 'true' + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.10 + method: POST + uri: https://api.openai.com/v1/embeddings + response: + body: + string: |- + { + "error": { + "message": "Incorrect API key provided: test-api-****. You can find your API key at https://platform.openai.com/account/api-keys.", + "type": "invalid_request_error", + "param": null, + "code": "invalid_api_key" + } + } + headers: + content-type: + - application/json + date: + - Sun, 21 Sep 2025 04:09:41 GMT + openai-organization: + - test_openai_org_id + x-request-id: + - emb-err-50308e7e-2aac-4167-a8fb-03f9f5ed8169 + content-length: + - '256' + status: + code: 401 + message: Unauthorized +version: 1 diff --git a/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_langchain_embedding.py b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_langchain_embedding.py new file mode 100644 index 00000000..9560a40c --- /dev/null +++ b/instrumentation-genai/opentelemetry-instrumentation-langchain/tests/test_langchain_embedding.py @@ -0,0 +1,154 @@ +"""Minimal LangChain embedding instrumentation test. + +Follows the same VCR cassette integration pattern as test_langchain_llm.py +to validate that embedding instrumentation emits correct telemetry: + +1. An embedding invocation succeeds using the recorded VCR cassette. +2. A span is emitted with GenAI semantic convention attributes for an embeddings op. +3. The default operation_name is 'embeddings' (from EmbeddingInvocation types.py default). +4. Core request model attribute exists and is plausible. +5. Metrics (duration at minimum) are produced and contain at least one data point. +""" + +from __future__ import annotations + +# mypy: ignore-errors +# pyright: reportGeneralTypeIssues=false, reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportUnknownArgumentType=false, reportAttributeAccessIssue=false, reportCallIssue=false + +from typing import Any, List +import pytest +from pytest import MonkeyPatch +from pydantic import SecretStr + +from langchain_openai import OpenAIEmbeddings + +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.semconv._incubating.metrics import gen_ai_metrics +from opentelemetry.sdk.trace import ReadableSpan # test-only type reference +from opentelemetry.trace.status import StatusCode +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter +from opentelemetry.sdk.metrics.export import InMemoryMetricReader + + +EMBEDDINGS = gen_ai_attributes.GenAiOperationNameValues.EMBEDDINGS.value + + +@pytest.mark.vcr() +def test_langchain_embedding_call( + span_exporter: InMemorySpanExporter, + metric_reader: InMemoryMetricReader, + instrument_with_content: Any, + monkeypatch: MonkeyPatch, +): + # Arrange + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + model = "text-embedding-ada-002" + embeddings = OpenAIEmbeddings( + model=model, + api_key=SecretStr("test-api-key"), + check_embedding_ctx_length=False, # avoid tiktoken download in test + ) + + # Act + result = embeddings.embed_query("What is the capital of France?") + + # Basic functional assertion – result must be a list of floats + assert isinstance(result, list), "Expected a list of floats" + assert len(result) > 0, "Expected non-empty embedding vector" + assert all(isinstance(v, float) for v in result), "All values must be floats" + + # Spans + spans: List[ReadableSpan] = span_exporter.get_finished_spans() # type: ignore[assignment] + assert spans, "Expected at least one span" + embedding_span = None + for s in spans: + attrs_obj = getattr(s, "attributes", None) + op_name = None + try: + if attrs_obj is not None: + op_name = attrs_obj.get(gen_ai_attributes.GEN_AI_OPERATION_NAME) + except Exception: + op_name = None + if op_name == EMBEDDINGS: + embedding_span = s + break + assert embedding_span is not None, "No embeddings operation span found" + + # Span attribute sanity + attrs = getattr(embedding_span, "attributes", {}) + assert attrs.get(gen_ai_attributes.GEN_AI_OPERATION_NAME) == EMBEDDINGS + assert attrs.get(gen_ai_attributes.GEN_AI_REQUEST_MODEL) == model + # If token usage captured ensure it is a non-negative integer + tok_val = attrs.get(gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS) + if tok_val is not None: + assert isinstance(tok_val, int) and tok_val >= 0 + + # Span name should follow "{operation_name} {request_model}" convention + assert embedding_span.name == f"embeddings {model}" + + # Metrics – ensure at least duration histogram present with >=1 point + metrics_data = metric_reader.get_metrics_data() + found_duration = False + if metrics_data: + for rm in getattr(metrics_data, "resource_metrics", []) or []: + for scope in getattr(rm, "scope_metrics", []) or []: + for metric in getattr(scope, "metrics", []) or []: + if metric.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION: + dps = getattr(metric.data, "data_points", []) + if dps: + assert dps[0].sum >= 0 + found_duration = True + assert found_duration, "Duration metric missing" + + +@pytest.mark.vcr() +def test_langchain_embedding_call_error( + span_exporter: InMemorySpanExporter, + instrument_with_content: Any, + monkeypatch: MonkeyPatch, +): + """When the embedding API returns an error the wrapper must: + 1. Still emit a span with operation_name == 'embeddings'. + 2. Mark the span status as ERROR. + 3. Re-raise the original exception so the caller sees the failure. + """ + # Arrange + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + model = "text-embedding-ada-002" + embeddings = OpenAIEmbeddings( + model=model, + api_key=SecretStr("test-api-key"), + check_embedding_ctx_length=False, # avoid tiktoken download in test + max_retries=0, # fail immediately, don't retry on 401 + ) + + # Act – the call should raise because the cassette returns a 401 + with pytest.raises(Exception): + embeddings.embed_query("What is the capital of France?") + + # Spans – an embedding span must still be emitted + spans: List[ReadableSpan] = span_exporter.get_finished_spans() # type: ignore[assignment] + assert spans, "Expected at least one span even on error" + embedding_span = None + for s in spans: + attrs_obj = getattr(s, "attributes", None) + op_name = None + try: + if attrs_obj is not None: + op_name = attrs_obj.get(gen_ai_attributes.GEN_AI_OPERATION_NAME) + except Exception: + op_name = None + if op_name == EMBEDDINGS: + embedding_span = s + break + assert embedding_span is not None, ( + "No embeddings operation span found on error path" + ) + + # Span attribute sanity + attrs = getattr(embedding_span, "attributes", {}) + assert attrs.get(gen_ai_attributes.GEN_AI_OPERATION_NAME) == EMBEDDINGS + assert attrs.get(gen_ai_attributes.GEN_AI_REQUEST_MODEL) == model + + # Span must be marked as error + assert embedding_span.status.status_code == StatusCode.ERROR diff --git a/util/opentelemetry-util-genai/tests/test_embedding_invocation.py b/util/opentelemetry-util-genai/tests/test_embedding_invocation.py index eabc3085..fa09dea1 100644 --- a/util/opentelemetry-util-genai/tests/test_embedding_invocation.py +++ b/util/opentelemetry-util-genai/tests/test_embedding_invocation.py @@ -1,5 +1,15 @@ +"""Tests for EmbeddingInvocation lifecycle, defaults, and telemetry.""" + +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import SimpleSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, +) +from opentelemetry.semconv._incubating.attributes import ( + gen_ai_attributes as GenAI, +) from opentelemetry.util.genai.handler import get_telemetry_handler -from opentelemetry.util.genai.types import EmbeddingInvocation +from opentelemetry.util.genai.types import EmbeddingInvocation, Error def test_embedding_invocation_creates_span(): @@ -16,3 +26,140 @@ def test_embedding_invocation_creates_span(): # span should have ended (recording possibly false depending on SDK impl) # we at least assert the object reference still exists assert emb.span is not None + + +def test_embedding_invocation_default_operation_name(): + """EmbeddingInvocation should default operation_name to 'embeddings'.""" + emb = EmbeddingInvocation( + request_model="text-embedding-ada-002", + input_texts=["hello"], + ) + assert ( + emb.operation_name == GenAI.GenAiOperationNameValues.EMBEDDINGS.value + ) + assert emb.operation_name == "embeddings" + + +def test_embedding_invocation_semantic_convention_attributes(): + """semantic_convention_attributes() should include the default operation_name.""" + emb = EmbeddingInvocation( + request_model="text-embedding-3-small", + input_texts=["test input"], + provider="openai", + ) + semconv_attrs = emb.semantic_convention_attributes() + + assert GenAI.GEN_AI_OPERATION_NAME in semconv_attrs + assert semconv_attrs[GenAI.GEN_AI_OPERATION_NAME] == "embeddings" + assert GenAI.GEN_AI_REQUEST_MODEL in semconv_attrs + assert ( + semconv_attrs[GenAI.GEN_AI_REQUEST_MODEL] == "text-embedding-3-small" + ) + + +def test_embedding_invocation_span_attributes(): + """Spans should carry the correct operation_name attribute from the default.""" + span_exporter = InMemorySpanExporter() + tracer_provider = TracerProvider() + tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + + handler = get_telemetry_handler() + span_emitters = list(handler._emitter.emitters_for("span")) + if span_emitters: + span_emitters[0]._tracer = tracer_provider.get_tracer(__name__) + + emb = EmbeddingInvocation( + request_model="text-embedding-ada-002", + input_texts=["hello world"], + provider="openai", + ) + + handler.start_embedding(emb) + handler.stop_embedding(emb) + + spans = span_exporter.get_finished_spans() + assert len(spans) == 1 + + span = spans[0] + attrs = span.attributes + + # operation_name should be "embeddings" (the default from types.py) + assert attrs[GenAI.GEN_AI_OPERATION_NAME] == "embeddings" + assert attrs[GenAI.GEN_AI_REQUEST_MODEL] == "text-embedding-ada-002" + + +def test_embedding_invocation_span_name(): + """Span name should be '{operation_name} {request_model}'.""" + span_exporter = InMemorySpanExporter() + tracer_provider = TracerProvider() + tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + + handler = get_telemetry_handler() + span_emitters = list(handler._emitter.emitters_for("span")) + if span_emitters: + span_emitters[0]._tracer = tracer_provider.get_tracer(__name__) + + emb = EmbeddingInvocation( + request_model="text-embedding-3-large", + input_texts=["test"], + provider="openai", + ) + + handler.start_embedding(emb) + handler.stop_embedding(emb) + + spans = span_exporter.get_finished_spans() + assert len(spans) == 1 + assert spans[0].name == "embeddings text-embedding-3-large" + + +def test_embedding_invocation_with_error(): + """Error path should still produce a span with correct operation_name.""" + span_exporter = InMemorySpanExporter() + tracer_provider = TracerProvider() + tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + + handler = get_telemetry_handler() + span_emitters = list(handler._emitter.emitters_for("span")) + if span_emitters: + span_emitters[0]._tracer = tracer_provider.get_tracer(__name__) + + emb = EmbeddingInvocation( + request_model="text-embedding-ada-002", + input_texts=["test"], + provider="openai", + ) + + handler.start_embedding(emb) + handler.fail_embedding(emb, Error(message="API error", type=RuntimeError)) + + spans = span_exporter.get_finished_spans() + assert len(spans) == 1 + + span = spans[0] + attrs = span.attributes + assert attrs[GenAI.GEN_AI_OPERATION_NAME] == "embeddings" + + +def test_embedding_invocation_custom_operation_name_override(): + """If a caller explicitly sets operation_name, the override should be honoured.""" + emb = EmbeddingInvocation( + operation_name="custom_embedding", + request_model="my-model", + input_texts=["x"], + ) + assert emb.operation_name == "custom_embedding" + + +def test_embedding_invocation_without_explicit_operation_name_matches_langchain_usage(): + """Verify the pattern used by langchain instrumentation (no operation_name kwarg) + produces the correct default.""" + # This mirrors the construction in langchain __init__.py after the fix: + # UtilEmbeddingInvocation(request_model=..., input_texts=..., provider=..., attributes=...) + emb = EmbeddingInvocation( + request_model="text-embedding-ada-002", + input_texts=["hello world"], + provider="openai", + attributes={"framework": "langchain"}, + ) + assert emb.operation_name == "embeddings"