diff --git a/README.md b/README.md index 486ac2ff..3d5b4db2 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,7 @@ Evaluation worker -> evaluate -> handler.evaluation_results(list) -> CompositeEm - Backpressure strategies for high-volume content events. ## 14. Development setup + Get the packages installed: Setup a virtual env (Note: will erase your .venv in the current folder) @@ -272,8 +273,6 @@ pip install -e util/opentelemetry-util-genai --no-deps pip install -e util/opentelemetry-util-genai-evals --no-deps pip install -e util/opentelemetry-util-genai-evals-deepeval --no-deps pip install -e util/opentelemetry-util-genai-emitters-splunk --no-deps -pip install -e util/opentelemetry-util-genai-traceloop-translator --no-deps -pip install -e instrumentation-genai/opentelemetry-instrumentation-langchain --no-deps pip install -r dev-genai-requirements.txt pip install -r instrumentation-genai/opentelemetry-instrumentation-langchain/examples/manual/requirements.txt @@ -281,11 +280,80 @@ export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental export OTEL_INSTRUMENTATION_GENAI_EMITTERS=span_metric_event,splunk export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT_MODE=SPAN_AND_EVENT -export OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS="Deepeval(LLMInvocation(bias,toxicity))" +# configure which GenAI types to evaluate and which evaluations +export OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS="Deepeval(LLMInvocation(bias,toxicity))" +# Deepeval optimization +export DEEPEVAL_FILE_SYSTEM=READ_ONLY +export DEEPEVAL_TELEMETRY_OPT_OUT=YES +# set environment and service names for ease of filtering +export OTEL_SERVICE_NAME=genai-eval-test +export OTEL_RESOURCE_ATTRIBUTES='deployment.environment=genai-dev' +``` + +For telemetry to properly work with Splunk Platform instrumentation, set the env var to enable Splunk format for aggregated evaluation results. 
+ +```bash export OTEL_INSTRUMENTATION_GENAI_EVALS_RESULTS_AGGREGATION=true +export OTEL_INSTRUMENTATION_GENAI_EMITTERS_EVALUATION="replace-category:SplunkEvaluationResults" +``` + +### Deepeval evaluator integration configuration + +Instrumentation-side evaluations can be configured using the `OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS` environment variable. + +```bash +# uses defaults - evaluates LLMInvocation and AgentInvocation with 5 metrics: +# (bias,toxicity,answer_relevancy,hallucination,sentiment) +OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS="deepeval" + +# Specific metrics for LLMInvocation +OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS="deepeval(LLMInvocation(bias,toxicity))" + +# Multiple types with metrics +OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS="deepeval(LLMInvocation(bias,toxicity),AgentInvocation(hallucination))" + +# With metric options +OTEL_INSTRUMENTATION_GENAI_EVALS_EVALUATORS="deepeval(LLMInvocation(hallucination(threshold=0.8)))" +``` + +```bash +export OTEL_INSTRUMENTATION_GENAI_DEBUG=true +``` + +### Installing an instrumentation library + +```bash +pip install -e instrumentation-genai/opentelemetry-instrumentation-langchain --no-deps +``` + +Examples for each instrumentation library or package can be found in `/examples`, e.g. + +```bash +util/opentelemetry-util-genai/examples/ +``` + +### Installing a Translator library + +To use existing 3rd party instrumentations and convert their telemetry to Splunk Distro semantic conventions, or to run instrumentation-side evaluations on it, you can install a translator library. 
+ +For example, for existing Traceloop instrumentations: +```bash +pip install -e util/opentelemetry-util-genai-traceloop-translator --no-deps +``` + +## Installing aidefense instrumentation + +```bash +pip install -e instrumentation-genai/opentelemetry-instrumentation-aidefense + +export AI_DEFENSE_API_KEY="your-ai-defense-key" + +python instrumentation-genai/opentelemetry-instrumentation-aidefense/examples/multi_agent_travel_planner/main.py ``` -Sudo-code to create LLMInvocation for your in-code llm code +## In-code instrumentation example + +Pseudo-code to create an LLMInvocation in your application code: ```python from opentelemetry.util.genai.handler import get_telemetry_handler @@ -302,14 +370,10 @@ inv.output_messages = [OutputMessage(role="assistant", parts=[Text("Hi!")], fini handler.stop_llm(inv) ``` -Additionally, for `aidefense` +Additionally, you can run a simple example reporting an LLM Invocation: ```bash -pip install -e instrumentation-genai/opentelemetry-instrumentation-aidefense - -export AI_DEFENSE_API_KEY="your-ai-defense-key" - -python instrumentation-genai/opentelemetry-instrumentation-aidefense/examples/multi_agent_travel_planner/main.py +python util/opentelemetry-util-genai/examples/invocation_example.py llm --exporter otlp ``` ## 15. Linting and Formatting diff --git a/util/opentelemetry-util-genai-evals/CHANGELOG.md b/util/opentelemetry-util-genai-evals/CHANGELOG.md index e798275e..f51a8f05 100644 --- a/util/opentelemetry-util-genai-evals/CHANGELOG.md +++ b/util/opentelemetry-util-genai-evals/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this repository are documented in this file. +## Version 0.1.8 - 2026-02-06 + +### Fixed +- **Logging visibility** - INFO level log messages from the evals bootstrap, proxy, manager, and worker modules are now always visible (e.g., "Using separate process evaluation mode"). DEBUG level messages require `OTEL_INSTRUMENTATION_GENAI_DEBUG=true`. 
+ ## Version 0.1.7 - 2026-01-28 ### Added diff --git a/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/bootstrap.py b/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/bootstrap.py index 2dc2c5bd..5f743d90 100644 --- a/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/bootstrap.py +++ b/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/bootstrap.py @@ -5,6 +5,9 @@ import logging from typing import Any +# Import debug module to ensure parent logger is configured when debug is enabled +# This must happen before any logging calls in this module +from opentelemetry.util.genai import debug as _debug # noqa: F401 from opentelemetry.util.genai.callbacks import CompletionCallback from .manager import Manager diff --git a/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/version.py b/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/version.py index 4e891f14..08721248 100644 --- a/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/version.py +++ b/util/opentelemetry-util-genai-evals/src/opentelemetry/util/genai/evals/version.py @@ -1,3 +1,3 @@ """Version metadata for opentelemetry-util-genai-evals.""" -__version__ = "0.1.7" +__version__ = "0.1.8" diff --git a/util/opentelemetry-util-genai/CHANGELOG.md b/util/opentelemetry-util-genai/CHANGELOG.md index a15f0318..4ddc77a7 100644 --- a/util/opentelemetry-util-genai/CHANGELOG.md +++ b/util/opentelemetry-util-genai/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this repository are documented in this file. +## Version 0.1.10 - 2026-02-06 + +### Fixed +- **Logging visibility for evals module** - INFO level messages from `opentelemetry.util.genai.evals.*` modules are now always visible (e.g., "Using separate process evaluation mode"). DEBUG level messages require `OTEL_INSTRUMENTATION_GENAI_DEBUG=true`. 
+ ## Version 0.1.9 - 2026-01-29 - Release 0.1.9 diff --git a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/debug.py b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/debug.py index 37e7008d..880b41f3 100644 --- a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/debug.py +++ b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/debug.py @@ -55,6 +55,18 @@ def _read_enabled_flag() -> bool: handler.setFormatter(fmt) _LOGGER.addHandler(handler) _LOGGER.setLevel(logging.DEBUG) + _LOGGER.propagate = False # Prevent duplicate logs via parent logger + +# Configure the parent logger for all opentelemetry.util.genai.* modules +# including evals subpackages. INFO level messages (like "Using separate process +# evaluation mode") are always visible; DEBUG level requires the debug flag. +_PARENT_LOGGER = logging.getLogger("opentelemetry.util.genai") +if not _PARENT_LOGGER.handlers: + parent_handler = logging.StreamHandler() + parent_fmt = logging.Formatter("[%(name)s] %(levelname)s: %(message)s") + parent_handler.setFormatter(parent_fmt) + _PARENT_LOGGER.addHandler(parent_handler) + _PARENT_LOGGER.setLevel(logging.DEBUG if _ENABLED else logging.INFO) def is_enabled() -> bool: diff --git a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/version.py b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/version.py index cb785abb..ad64ed74 100644 --- a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/version.py +++ b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.1.9" +__version__ = "0.1.10" diff --git a/util/opentelemetry-util-genai/tests/test_upload_hook.py b/util/opentelemetry-util-genai/tests/test_upload_hook.py index 93731bce..8351ed6e 100644 --- a/util/opentelemetry-util-genai/tests/test_upload_hook.py +++ b/util/opentelemetry-util-genai/tests/test_upload_hook.py @@ -70,7 +70,9 @@ def test_load_upload_hook_invalid(self, mock_entry_points: Mock): FakeEntryPoint("my-hook", lambda: InvalidUploadHook) ] - with self.assertLogs(level=logging.DEBUG) as logs: + with self.assertLogs( + "opentelemetry.util.genai.upload_hook", level=logging.DEBUG + ) as logs: self.assertIsInstance(load_upload_hook(), _NoOpUploadHook) self.assertEqual(len(logs.output), 1) self.assertIn("is not a valid UploadHook. Using noop", logs.output[0])