From d59a1eb90fcce6f8734d8b0f66f1bf12fef2642c Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 00:30:32 +0100 Subject: [PATCH 01/12] fix: structlog instead of logging, also updated middlewares --- backend/app/api/routes/admin/events.py | 4 +- backend/app/api/routes/execution.py | 22 +- backend/app/core/adaptive_sampling.py | 202 ---------- backend/app/core/container.py | 2 + backend/app/core/correlation.py | 67 +--- backend/app/core/database_context.py | 14 - backend/app/core/dishka_lifespan.py | 29 -- backend/app/core/logging.py | 197 ++++------ backend/app/core/middlewares/metrics.py | 10 +- backend/app/core/providers.py | 123 +++--- backend/app/core/tracing/__init__.py | 110 +++--- backend/app/core/tracing/config.py | 200 ---------- backend/app/core/tracing/models.py | 92 ----- backend/app/core/tracing/utils.py | 62 --- .../admin/admin_settings_repository.py | 5 +- backend/app/db/repositories/dlq_repository.py | 4 +- .../app/db/repositories/event_repository.py | 18 +- .../db/repositories/execution_repository.py | 4 +- .../repositories/notification_repository.py | 4 +- .../app/db/repositories/replay_repository.py | 4 +- .../repositories/user_settings_repository.py | 4 +- backend/app/dlq/manager.py | 4 +- backend/app/domain/events/typed.py | 3 +- backend/app/events/core/producer.py | 4 +- backend/app/main.py | 1 - .../services/admin/admin_events_service.py | 48 +-- .../services/admin/admin_settings_service.py | 4 +- .../app/services/admin/admin_user_service.py | 5 +- backend/app/services/auth_service.py | 10 +- .../app/services/coordinator/coordinator.py | 5 +- .../services/event_replay/replay_service.py | 4 +- backend/app/services/execution_service.py | 20 +- .../idempotency/idempotency_manager.py | 4 +- backend/app/services/k8s_worker/worker.py | 4 +- backend/app/services/kafka_event_service.py | 4 +- backend/app/services/login_lockout.py | 5 +- .../app/services/notification_scheduler.py | 4 +- backend/app/services/notification_service.py | 28 +- .../app/services/pod_monitor/event_mapper.py | 4 +- backend/app/services/pod_monitor/monitor.py | 4 +- .../services/result_processor/processor.py | 4 +- .../result_processor/resource_cleaner.py | 4 +- backend/app/services/runtime_settings.py | 5 +- backend/app/services/saga/execution_saga.py | 5 +- .../app/services/saga/saga_orchestrator.py | 16 +- backend/app/services/saga/saga_service.py | 4 +- backend/app/services/saved_script_service.py | 4 +- backend/app/services/sse/redis_bus.py | 6 +- backend/app/services/sse/sse_service.py | 5 +- backend/app/services/user_settings_service.py | 4 +- backend/app/settings.py | 14 +- backend/config.test.toml | 12 +- backend/config.toml | 9 +- backend/otel-collector-config.yaml | 19 +- backend/pyproject.toml | 1 + backend/tests/e2e/core/test_middlewares.py | 18 +- .../db/repositories/test_dlq_repository.py | 4 +- .../repositories/test_execution_repository.py | 4 +- backend/tests/e2e/dlq/test_dlq_manager.py | 4 +- .../tests/e2e/idempotency/test_idempotency.py | 4 +- .../result_processor/test_result_processor.py | 4 +- .../sse/test_partitioned_event_router.py | 4 +- .../tests/e2e/services/sse/test_redis_bus.py | 4 +- .../tests/e2e/test_k8s_worker_create_pod.py | 4 +- .../tests/unit/core/test_adaptive_sampling.py | 62 --- .../unit/core/test_logging_and_correlation.py | 355 +++++------------- .../coordinator/test_coordinator_queue.py | 4 +- .../idempotency/test_idempotency_manager.py | 4 +- .../services/pod_monitor/test_event_mapper.py | 4 +- .../unit/services/pod_monitor/test_monitor.py | 4 +-
.../result_processor/test_processor.py | 4 +- .../saga/test_saga_orchestrator_unit.py | 4 +- .../services/sse/test_kafka_redis_bridge.py | 4 +- .../unit/services/sse/test_sse_service.py | 4 +- .../services/test_admin_settings_service.py | 4 +- .../tests/unit/services/test_login_lockout.py | 4 +- .../unit/services/test_runtime_settings.py | 4 +- backend/uv.lock | 11 + backend/workers/run_coordinator.py | 13 - backend/workers/run_dlq_processor.py | 13 - backend/workers/run_event_replay.py | 12 - backend/workers/run_k8s_worker.py | 13 - backend/workers/run_pod_monitor.py | 13 - backend/workers/run_result_processor.py | 13 - backend/workers/run_saga_orchestrator.py | 13 - 85 files changed, 536 insertions(+), 1530 deletions(-) delete mode 100644 backend/app/core/adaptive_sampling.py delete mode 100644 backend/app/core/database_context.py delete mode 100644 backend/app/core/tracing/config.py delete mode 100644 backend/app/core/tracing/models.py delete mode 100644 backend/app/core/tracing/utils.py delete mode 100644 backend/tests/unit/core/test_adaptive_sampling.py diff --git a/backend/app/api/routes/admin/events.py b/backend/app/api/routes/admin/events.py index a2a537c5..e6b26cfa 100644 --- a/backend/app/api/routes/admin/events.py +++ b/backend/app/api/routes/admin/events.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import Annotated +from uuid import uuid4 from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute @@ -7,7 +8,6 @@ from fastapi.responses import StreamingResponse from app.api.dependencies import admin_user -from app.core.correlation import CorrelationContext from app.domain.enums import EventType, ExportFormat from app.domain.events import EventFilter from app.domain.replay import ReplayFilter @@ -105,7 +105,7 @@ async def replay_events( request: EventReplayRequest, background_tasks: BackgroundTasks, service: FromDishka[AdminEventsService] ) -> EventReplayResponse: """Replay events by filter criteria, with optional dry-run mode.""" - replay_correlation_id = f"replay_{CorrelationContext.get_correlation_id()}" + replay_correlation_id = f"replay-{uuid4().hex}" result = await service.prepare_or_schedule_replay( replay_filter=ReplayFilter.model_validate(request), dry_run=request.dry_run, diff --git a/backend/app/api/routes/execution.py b/backend/app/api/routes/execution.py index 42340cb2..03a17f50 100644 --- a/backend/app/api/routes/execution.py +++ b/backend/app/api/routes/execution.py @@ -4,9 +4,9 @@ from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute, inject from fastapi import APIRouter, Depends, Header, HTTPException, Path, Query, Request +from opentelemetry import trace from app.api.dependencies import admin_user, current_user -from app.core.tracing import EventAttributes, add_span_attributes from app.core.utils import get_client_ip from app.domain.enums import EventType, ExecutionStatus, UserRole from app.domain.events import DomainEvent @@ -57,17 +57,15 @@ async def create_execution( idempotency_key: Annotated[str | None, Header(alias="Idempotency-Key")] = None, ) -> ExecutionResponse: """Submit a script for execution in an isolated Kubernetes pod.""" - add_span_attributes( - **{ - "http.method": "POST", - "http.route": "/api/v1/execute", - "execution.language": execution.lang, - "execution.language_version": execution.lang_version, - "execution.script_length": len(execution.script), - EventAttributes.USER_ID: current_user.user_id, - "client.address": get_client_ip(request), - } - ) + 
trace.get_current_span().set_attributes({ + "http.method": "POST", + "http.route": "/api/v1/execute", + "execution.language": execution.lang, + "execution.language_version": execution.lang_version, + "execution.script_length": len(execution.script), + "user.id": current_user.user_id, + "client.address": get_client_ip(request), + }) exec_result = await execution_service.execute_script_idempotent( script=execution.script, diff --git a/backend/app/core/adaptive_sampling.py b/backend/app/core/adaptive_sampling.py deleted file mode 100644 index ccbfe0e3..00000000 --- a/backend/app/core/adaptive_sampling.py +++ /dev/null @@ -1,202 +0,0 @@ -import logging -import time -from collections import deque -from collections.abc import Sequence - -from opentelemetry.context import Context -from opentelemetry.sdk.trace.sampling import Decision, Sampler, SamplingResult -from opentelemetry.trace import Link, SpanKind, TraceState, get_current_span -from opentelemetry.util.types import Attributes - -from app.settings import Settings - - -class AdaptiveSampler(Sampler): - """ - Adaptive sampler that adjusts sampling rate based on: - - Error rate - - Request rate - - Resource utilization - - Rate adjustment is lazy: it runs inline during should_sample() - when the adjustment interval has elapsed. - """ - - def __init__( - self, - base_rate: float = 0.1, - min_rate: float = 0.01, - max_rate: float = 1.0, - error_rate_threshold: float = 0.05, - high_traffic_threshold: int = 1000, - adjustment_interval: int = 60, - ): - """ - Initialize adaptive sampler - - Args: - base_rate: Base sampling rate (default 10%) - min_rate: Minimum sampling rate (default 1%) - max_rate: Maximum sampling rate (default 100%) - error_rate_threshold: Error rate that triggers increased sampling (default 5%) - high_traffic_threshold: Requests per minute to consider high traffic - adjustment_interval: Seconds between rate adjustments - """ - self.base_rate = base_rate - self.min_rate = min_rate - self.max_rate = max_rate - self.error_rate_threshold = error_rate_threshold - self.high_traffic_threshold = high_traffic_threshold - self.adjustment_interval = adjustment_interval - - self._current_rate = base_rate - self._last_adjustment = time.time() - - # Sliding window for rate calculation (1 minute window, pruned by _calculate_metrics) - self._request_window: deque[float] = deque() - self._error_window: deque[float] = deque() - - logging.getLogger("integr8scode").info(f"Adaptive sampler initialized with base rate: {base_rate}") - - def should_sample( - self, - parent_context: Context | None, - trace_id: int, - name: str, - kind: SpanKind | None = None, - attributes: Attributes | None = None, - links: Sequence[Link] | None = None, - trace_state: TraceState | None = None, - ) -> SamplingResult: - """Determine if a span should be sampled""" - parent_span_context = get_current_span(parent_context).get_span_context() - parent_trace_state = None - - # Always sample if parent was sampled - if parent_span_context is not None and parent_span_context.is_valid: - parent_trace_state = parent_span_context.trace_state - if parent_span_context.trace_flags.sampled: - if parent_trace_state is not None: - return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE, attributes=attributes, trace_state=parent_trace_state - ) - else: - return SamplingResult(decision=Decision.RECORD_AND_SAMPLE, attributes=attributes) - - # Track request - self._request_window.append(time.time()) - - # Always sample errors - if self._is_error(attributes): - 
self._error_window.append(time.time()) - if parent_trace_state is not None: - return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE, attributes=attributes, trace_state=parent_trace_state - ) - else: - return SamplingResult(decision=Decision.RECORD_AND_SAMPLE, attributes=attributes) - - # Lazy adjustment: re-evaluate rate when interval has elapsed - now = time.time() - if now - self._last_adjustment >= self.adjustment_interval: - self._last_adjustment = now - self._adjust_sampling_rate() - - # Apply current sampling rate using trace ID for deterministic sampling - max_trace_id = (1 << 64) - 1 - masked_trace_id = trace_id & max_trace_id - threshold = int(self._current_rate * max_trace_id) - if self._current_rate >= 1.0: - threshold = max_trace_id - should_sample = masked_trace_id < threshold - - if parent_trace_state is not None: - return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE if should_sample else Decision.DROP, - attributes=attributes if should_sample else None, - trace_state=parent_trace_state, - ) - else: - return SamplingResult( - decision=Decision.RECORD_AND_SAMPLE if should_sample else Decision.DROP, - attributes=attributes if should_sample else None, - ) - - def get_description(self) -> str: - """Return sampler description""" - return f"AdaptiveSampler(current_rate={self._current_rate:.2%})" - - def _is_error(self, attributes: Attributes | None) -> bool: - """Check if span attributes indicate an error""" - if not attributes: - return False - - if attributes.get("error", False): - return True - - status_code = attributes.get("http.status_code") - if status_code and isinstance(status_code, (int, float)): - if int(status_code) >= 500: - return True - elif status_code and isinstance(status_code, str) and status_code.isdigit(): - if int(status_code) >= 500: - return True - - if attributes.get("exception.type"): - return True - - return False - - def _calculate_metrics(self) -> tuple[float, int]: - """Calculate current error rate and request rate""" - minute_ago = time.time() - 60 - - while self._request_window and self._request_window[0] < minute_ago: - self._request_window.popleft() - while self._error_window and self._error_window[0] < minute_ago: - self._error_window.popleft() - - request_rate = len(self._request_window) - error_rate = len(self._error_window) / max(1, len(self._request_window)) - - return error_rate, request_rate - - def _adjust_sampling_rate(self) -> None: - """Adjust sampling rate based on current metrics""" - error_rate, request_rate = self._calculate_metrics() - - new_rate = self.base_rate - - if error_rate > self.error_rate_threshold: - error_multiplier: float = min(10.0, 1 + (error_rate / self.error_rate_threshold)) - new_rate = min(self.max_rate, self.base_rate * error_multiplier) - logging.getLogger("integr8scode").warning( - f"High error rate detected ({error_rate:.1%}), increasing sampling to {new_rate:.1%}" - ) - elif request_rate > self.high_traffic_threshold: - traffic_divisor = request_rate / self.high_traffic_threshold - new_rate = max(self.min_rate, self.base_rate / traffic_divisor) - logging.getLogger("integr8scode").info( - f"High traffic detected ({request_rate} req/min), decreasing sampling to {new_rate:.1%}" - ) - - if new_rate != self._current_rate: - change_rate = 0.5 - self._current_rate = self._current_rate + (new_rate - self._current_rate) * change_rate - - logging.getLogger("integr8scode").info( - f"Adjusted sampling rate to {self._current_rate:.1%} " - f"(error_rate: {error_rate:.1%}, request_rate: {request_rate} 
req/min)" - ) - - -def create_adaptive_sampler(settings: Settings) -> AdaptiveSampler: - """Create adaptive sampler with settings""" - return AdaptiveSampler( - base_rate=settings.TRACING_SAMPLING_RATE, - min_rate=max(0.001, settings.TRACING_SAMPLING_RATE / 100), - max_rate=1.0, - error_rate_threshold=0.05, - high_traffic_threshold=1000, - adjustment_interval=60, - ) diff --git a/backend/app/core/container.py b/backend/app/core/container.py index 43b737f1..3f3838d0 100644 --- a/backend/app/core/container.py +++ b/backend/app/core/container.py @@ -21,6 +21,7 @@ PodMonitorProvider, RedisProvider, RepositoryProvider, + RequestScopedProvider, ResourceCleanerProvider, ResultProcessorProvider, SagaOrchestratorProvider, @@ -63,6 +64,7 @@ def create_app_container(settings: Settings) -> AsyncContainer: CoordinatorProvider(), KubernetesProvider(), ResourceCleanerProvider(), + RequestScopedProvider(), FastapiProvider(), context={Settings: settings}, ) diff --git a/backend/app/core/correlation.py b/backend/app/core/correlation.py index dcaec4c2..9015fb27 100644 --- a/backend/app/core/correlation.py +++ b/backend/app/core/correlation.py @@ -1,40 +1,11 @@ import uuid -from datetime import datetime, timezone -from typing import Any -from starlette.datastructures import MutableHeaders +from starlette.datastructures import Headers, MutableHeaders from starlette.types import ASGIApp, Message, Receive, Scope, Send -from app.core.logging import correlation_id_context, request_metadata_context - - -class CorrelationContext: - @staticmethod - def generate_correlation_id() -> str: - return f"req_{uuid.uuid4()}_{int(datetime.now(timezone.utc).timestamp())}" - - @staticmethod - def set_correlation_id(correlation_id: str) -> str: - correlation_id_context.set(correlation_id) - return correlation_id - - @staticmethod - def get_correlation_id() -> str: - return correlation_id_context.get() or "" - - @staticmethod - def set_request_metadata(metadata: dict[str, Any]) -> None: - request_metadata_context.set(metadata) - - @staticmethod - def clear() -> None: - correlation_id_context.set(None) - request_metadata_context.set(None) - class CorrelationMiddleware: CORRELATION_HEADER = "X-Correlation-ID" - REQUEST_ID_HEADER = "X-Request-ID" def __init__(self, app: ASGIApp) -> None: self.app = app @@ -44,42 +15,14 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: await self.app(scope, receive, send) return - # Try to get correlation ID from headers - headers = dict(scope["headers"]) - correlation_id = None - - for header_name in [b"x-correlation-id", b"x-request-id"]: - if header_name in headers: - correlation_id = headers[header_name].decode("latin-1") - break - - # Generate correlation ID if not provided - if not correlation_id: - correlation_id = CorrelationContext.generate_correlation_id() + headers = Headers(scope=scope) + correlation_id = headers.get("x-correlation-id") or f"req-{uuid.uuid4().hex}" - # Set correlation ID - correlation_id = CorrelationContext.set_correlation_id(correlation_id) + scope.setdefault("state", {})["correlation_id"] = correlation_id - # Set request metadata - client = scope.get("client") - client_ip = client[0] if client else None - - metadata = { - "method": scope["method"], - "path": scope["path"], - "client": {"host": client_ip} if client_ip else None, - } - CorrelationContext.set_request_metadata(metadata) - - # Add correlation ID to response headers async def send_wrapper(message: Message) -> None: if message["type"] == "http.response.start": - headers = 
MutableHeaders(scope=message) - headers[self.CORRELATION_HEADER] = correlation_id + MutableHeaders(scope=message)[self.CORRELATION_HEADER] = correlation_id await send(message) - # Process request await self.app(scope, receive, send_wrapper) - - # Clear context after request - CorrelationContext.clear() diff --git a/backend/app/core/database_context.py b/backend/app/core/database_context.py deleted file mode 100644 index 06913e03..00000000 --- a/backend/app/core/database_context.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any - -from pymongo.asynchronous.client_session import AsyncClientSession -from pymongo.asynchronous.collection import AsyncCollection -from pymongo.asynchronous.cursor import AsyncCursor -from pymongo.asynchronous.database import AsyncDatabase -from pymongo.asynchronous.mongo_client import AsyncMongoClient - -type MongoDocument = dict[str, Any] -type DBClient = AsyncMongoClient[MongoDocument] -type Database = AsyncDatabase[MongoDocument] -type Collection = AsyncCollection[MongoDocument] -type Cursor = AsyncCursor[MongoDocument] -type DBSession = AsyncClientSession diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py index f923c0ca..a7d23c4a 100644 --- a/backend/app/core/dishka_lifespan.py +++ b/backend/app/core/dishka_lifespan.py @@ -11,7 +11,6 @@ from pymongo import AsyncMongoClient from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS from app.events.handlers import ( register_notification_subscriber, @@ -53,34 +52,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: }, ) - # Initialize tracing only when enabled (avoid exporter retries in tests) - if settings.ENABLE_TRACING and not settings.TESTING: - instrumentation_report = init_tracing( - service_name=settings.TRACING_SERVICE_NAME, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - sampling_rate=settings.TRACING_SAMPLING_RATE, - enable_console_exporter=settings.TESTING, - adaptive_sampling=settings.TRACING_ADAPTIVE_SAMPLING, - ) - - if instrumentation_report.has_failures(): - logger.warning( - "Some instrumentation libraries failed to initialize", - extra={"instrumentation_summary": instrumentation_report.get_summary()}, - ) - else: - logger.info( - "Distributed tracing initialized successfully", - extra={"instrumentation_summary": instrumentation_report.get_summary()}, - ) - else: - logger.info( - "Distributed tracing disabled", - extra={"testing": settings.TESTING, "enable_tracing": settings.ENABLE_TRACING}, - ) - # Get unstarted broker from DI (BrokerProvider yields without starting) broker: KafkaBroker = await container.get(KafkaBroker) app.state.kafka_broker = broker diff --git a/backend/app/core/logging.py b/backend/app/core/logging.py index 5c79f09a..bb48287c 100644 --- a/backend/app/core/logging.py +++ b/backend/app/core/logging.py @@ -1,101 +1,49 @@ -import contextvars -import json import logging import re -from datetime import datetime, timezone -from typing import Any +import structlog from opentelemetry import trace -correlation_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar("correlation_id", default=None) - -request_metadata_context: contextvars.ContextVar[dict[str, Any] | None] = contextvars.ContextVar( - "request_metadata", default=None -) - - -class CorrelationFilter(logging.Filter): - def filter(self, record: logging.LogRecord) -> bool: - correlation_id = correlation_id_context.get() - if correlation_id: - 
record.correlation_id = correlation_id - - metadata = request_metadata_context.get() - if metadata: - record.request_method = metadata.get("method") - record.request_path = metadata.get("path") - # Client IP is now safely extracted without DNS lookup - if metadata.get("client"): - record.client_host = metadata["client"].get("host") - - return True - - -class JSONFormatter(logging.Formatter): - def _sanitize_sensitive_data(self, data: str) -> str: - """Remove or mask sensitive information from log data.""" - # Mask API keys, tokens, and similar sensitive data - patterns = [ - # API keys and tokens - ( - r'(["\']?(?:api[_-]?)?(?:key|token|secret|password|passwd|pwd)["\']?\s*[:=]\s*["\']?)([^"\']+)(["\']?)', - r"\1***API_KEY_OR_TOKEN_REDACTED***\3", - ), - # Bearer tokens - (r"(Bearer\s+)([A-Za-z0-9\-_]+)", r"\1***BEARER_TOKEN_REDACTED***"), - # JWT tokens - (r"(eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+)", r"***JWT_REDACTED***"), - # MongoDB URLs with credentials - (r"(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)", r"\1***MONGODB_REDACTED***\3"), - # Generic URLs with credentials - (r"(https?://[^:]+:)([^@]+)(@)", r"\1***URL_CREDS_REDACTED***\3"), - # Email addresses (optional - uncomment if needed) - (r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"***EMAIL_REDACTED***"), - ] - - for pattern, replacement in patterns: - data = re.sub(pattern, replacement, data, flags=re.IGNORECASE) - - return data - - def format(self, record: logging.LogRecord) -> str: - # Sanitize the message - message = self._sanitize_sensitive_data(record.getMessage()) - - log_data = { - "timestamp": datetime.now(timezone.utc).isoformat(), - "level": record.levelname, - "logger": record.name, - "message": message, - } - - if hasattr(record, "correlation_id"): - log_data["correlation_id"] = record.correlation_id - - if hasattr(record, "request_method"): - log_data["request_method"] = record.request_method - - if hasattr(record, "request_path"): - log_data["request_path"] = record.request_path - - if hasattr(record, "client_host"): - log_data["client_host"] = record.client_host - - # OpenTelemetry trace context (hexadecimal ids) - if hasattr(record, "trace_id"): - log_data["trace_id"] = record.trace_id - if hasattr(record, "span_id"): - log_data["span_id"] = record.span_id - - if record.exc_info: - exc_text = self.formatException(record.exc_info) - log_data["exc_info"] = self._sanitize_sensitive_data(exc_text) - - if hasattr(record, "stack_info") and record.stack_info: - stack_text = self.formatStack(record.stack_info) - log_data["stack_info"] = self._sanitize_sensitive_data(stack_text) - - return json.dumps(log_data, ensure_ascii=False) +SENSITIVE_PATTERNS: list[tuple[str, str]] = [ + ( + r'(["\']?(?:api[_-]?)?(?:key|token|secret|password|passwd|pwd)["\']?\s*[:=]\s*["\']?)([^"\']+)(["\']?)', + r"\1***API_KEY_OR_TOKEN_REDACTED***\3", + ), + (r"(Bearer\s+)([A-Za-z0-9\-_]+)", r"\1***BEARER_TOKEN_REDACTED***"), + (r"(eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+)", r"***JWT_REDACTED***"), + (r"(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)", r"\1***MONGODB_REDACTED***\3"), + (r"(https?://[^:]+:)([^@]+)(@)", r"\1***URL_CREDS_REDACTED***\3"), + (r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"***EMAIL_REDACTED***"), +] + + +def sanitize_sensitive_data( + logger: structlog.types.WrappedLogger, + method_name: str, + event_dict: structlog.types.EventDict, +) -> structlog.types.EventDict: + """Structlog processor that redacts sensitive data from the event message.""" + event = event_dict.get("event", "") + if 
isinstance(event, str): + for pattern, replacement in SENSITIVE_PATTERNS: + event = re.sub(pattern, replacement, event, flags=re.IGNORECASE) + event_dict["event"] = event + return event_dict + + +def add_otel_context( + logger: structlog.types.WrappedLogger, + method_name: str, + event_dict: structlog.types.EventDict, +) -> structlog.types.EventDict: + """Structlog processor that adds OpenTelemetry trace/span IDs.""" + span = trace.get_current_span() + if span and span.is_recording(): + span_context = span.get_span_context() + if span_context.is_valid: + event_dict["trace_id"] = format(span_context.trace_id, "032x") + event_dict["span_id"] = format(span_context.span_id, "016x") + return event_dict LOG_LEVELS: dict[str, int] = { @@ -107,41 +55,34 @@ def format(self, record: logging.LogRecord) -> str: } -def setup_logger(log_level: str) -> logging.Logger: - """Create and configure the application logger. Called by DI with Settings.LOG_LEVEL.""" - new_logger = logging.getLogger("integr8scode") - new_logger.handlers.clear() - - console_handler = logging.StreamHandler() - formatter = JSONFormatter() - - console_handler.setFormatter(formatter) - - correlation_filter = CorrelationFilter() - console_handler.addFilter(correlation_filter) - - class TracingFilter(logging.Filter): - def filter(self, record: logging.LogRecord) -> bool: - # Inline minimal helpers to avoid circular import on tracing.utils - span = trace.get_current_span() - trace_id = None - span_id = None - if span and span.is_recording(): - span_context = span.get_span_context() - if span_context.is_valid: - trace_id = format(span_context.trace_id, "032x") - span_id = format(span_context.span_id, "016x") - if trace_id: - record.trace_id = trace_id - if span_id: - record.span_id = span_id - return True - - console_handler.addFilter(TracingFilter()) - - new_logger.addHandler(console_handler) +def setup_logger(log_level: str) -> structlog.stdlib.BoundLogger: + """Configure structlog and return a bound logger for the application. + Called by DI with Settings.LOG_LEVEL and also directly by main.py/lifespan. 
+ """ level = LOG_LEVELS.get(log_level.upper(), logging.DEBUG) - new_logger.setLevel(level) - return new_logger + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.stdlib.filter_by_level, + structlog.stdlib.add_logger_name, + structlog.stdlib.add_log_level, + structlog.processors.TimeStamper(fmt="iso"), + sanitize_sensitive_data, + add_otel_context, + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.JSONRenderer(), + ], + wrapper_class=structlog.stdlib.BoundLogger, + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + logging.basicConfig(level=level, format="%(message)s", handlers=[logging.StreamHandler()]) + logging.getLogger().setLevel(level) + + logger: structlog.stdlib.BoundLogger = structlog.get_logger("integr8scode") + return logger diff --git a/backend/app/core/middlewares/metrics.py b/backend/app/core/middlewares/metrics.py index 93a00f98..34bcdb8b 100644 --- a/backend/app/core/middlewares/metrics.py +++ b/backend/app/core/middlewares/metrics.py @@ -1,9 +1,9 @@ -import logging import os import re import time import psutil +import structlog from opentelemetry import metrics from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter from opentelemetry.metrics import CallbackOptions, Observation @@ -117,7 +117,7 @@ def _get_path_template(path: str) -> str: return path -def setup_metrics(settings: Settings, logger: logging.Logger) -> None: +def setup_metrics(settings: Settings, logger: structlog.stdlib.BoundLogger) -> None: """Set up the global OpenTelemetry MeterProvider with OTLP exporter. This is the single initialization point for metrics export. ``BaseMetrics`` @@ -127,9 +127,9 @@ def setup_metrics(settings: Settings, logger: logging.Logger) -> None: """ if settings.TESTING or not settings.OTEL_EXPORTER_OTLP_ENDPOINT: logger.info( - "Metrics OTLP export disabled (testing=%s, endpoint=%s)", - settings.TESTING, - settings.OTEL_EXPORTER_OTLP_ENDPOINT, + "Metrics OTLP export disabled", + testing=settings.TESTING, + endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT, ) return diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 033132a9..8c982f9e 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -1,11 +1,12 @@ from __future__ import annotations -import logging from typing import AsyncIterator import redis.asyncio as redis +import structlog from apscheduler.schedulers.asyncio import AsyncIOScheduler from dishka import Provider, Scope, from_context, provide +from fastapi import Request from faststream.kafka import KafkaBroker from faststream.kafka.opentelemetry import KafkaTelemetryMiddleware from kubernetes_asyncio import client as k8s_client @@ -28,7 +29,7 @@ SecurityMetrics, ) from app.core.security import SecurityService -from app.core.tracing import TracerManager +from app.core.tracing import Tracer from app.db.repositories import ( AdminEventsRepository, AdminSettingsRepository, @@ -90,7 +91,7 @@ class BrokerProvider(Provider): @provide async def get_broker( - self, settings: Settings, logger: logging.Logger + self, settings: Settings, logger: structlog.stdlib.BoundLogger, _tracer: Tracer, ) -> AsyncIterator[KafkaBroker]: broker = KafkaBroker( settings.KAFKA_BOOTSTRAP_SERVERS, @@ -117,15 +118,33 @@ class LoggingProvider(Provider): scope = Scope.APP @provide - def get_logger(self, settings: Settings) -> logging.Logger: + def get_logger(self, 
settings: Settings) -> structlog.stdlib.BoundLogger: return setup_logger(settings.LOG_LEVEL) +class RequestScopedProvider(Provider): + """Provides REQUEST-scoped logger with correlation context bound from the request.""" + + scope = Scope.REQUEST + + @provide + def get_request_logger( + self, base: structlog.stdlib.BoundLogger, request: Request, + ) -> structlog.stdlib.BoundLogger: + return base.bind( + correlation_id=request.state.correlation_id, + request_path=str(request.url.path), + request_method=request.method, + ) + + class RedisProvider(Provider): scope = Scope.APP @provide - async def get_redis_client(self, settings: Settings, logger: logging.Logger) -> AsyncIterator[redis.Redis]: + async def get_redis_client( + self, settings: Settings, logger: structlog.stdlib.BoundLogger + ) -> AsyncIterator[redis.Redis]: # Create Redis client - it will automatically use the current event loop client = redis.Redis( host=settings.REDIS_HOST, @@ -152,7 +171,7 @@ async def get_rate_limit_service( redis_client: redis.Redis, settings: Settings, rate_limit_metrics: RateLimitMetrics, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> RateLimitService: service = RateLimitService(redis_client, settings, rate_limit_metrics) try: @@ -176,8 +195,10 @@ def get_security_service(self, settings: Settings) -> SecurityService: return SecurityService(settings) @provide - def get_tracer_manager(self, settings: Settings) -> TracerManager: - return TracerManager(tracer_name=settings.TRACING_SERVICE_NAME) + def get_tracer( + self, settings: Settings, logger: structlog.stdlib.BoundLogger, + ) -> Tracer: + return Tracer(settings, logger) class MessagingProvider(Provider): @@ -188,7 +209,7 @@ def get_unified_producer( self, broker: KafkaBroker, event_repository: EventRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, settings: Settings, event_metrics: EventMetrics, ) -> UnifiedProducer: @@ -200,7 +221,10 @@ def get_idempotency_repository(self, redis_client: redis.Redis) -> RedisIdempote @provide def get_idempotency_manager( - self, repo: RedisIdempotencyRepository, logger: logging.Logger, database_metrics: DatabaseMetrics + self, + repo: RedisIdempotencyRepository, + logger: structlog.stdlib.BoundLogger, + database_metrics: DatabaseMetrics, ) -> IdempotencyManager: return IdempotencyManager(IdempotencyConfig(), repo, logger, database_metrics) @@ -270,7 +294,7 @@ def get_dlq_manager( self, broker: KafkaBroker, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, dlq_metrics: DLQMetrics, repository: DLQRepository, ) -> DLQManager: @@ -298,7 +322,7 @@ async def get_dlq_manager( self, broker: KafkaBroker, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, dlq_metrics: DLQMetrics, repository: DLQRepository, ) -> AsyncIterator[DLQManager]: @@ -335,7 +359,9 @@ class KubernetesProvider(Provider): scope = Scope.APP @provide - async def get_api_client(self, settings: Settings, logger: logging.Logger) -> AsyncIterator[k8s_client.ApiClient]: + async def get_api_client( + self, settings: Settings, logger: structlog.stdlib.BoundLogger + ) -> AsyncIterator[k8s_client.ApiClient]: """Provide Kubernetes ApiClient with config loading and cleanup.""" await k8s_config.load_kube_config(config_file=settings.KUBERNETES_CONFIG_PATH) api_client = k8s_client.ApiClient() @@ -350,7 +376,9 @@ class ResourceCleanerProvider(Provider): scope = Scope.APP @provide - def get_resource_cleaner(self, api_client: k8s_client.ApiClient, logger: 
logging.Logger) -> ResourceCleaner: + def get_resource_cleaner( + self, api_client: k8s_client.ApiClient, logger: structlog.stdlib.BoundLogger + ) -> ResourceCleaner: return ResourceCleaner(api_client=api_client, logger=logger) @@ -418,7 +446,7 @@ class RepositoryProvider(Provider): scope = Scope.APP @provide - def get_execution_repository(self, logger: logging.Logger) -> ExecutionRepository: + def get_execution_repository(self, logger: structlog.stdlib.BoundLogger) -> ExecutionRepository: return ExecutionRepository(logger) @provide @@ -434,19 +462,19 @@ def get_saved_script_repository(self) -> SavedScriptRepository: return SavedScriptRepository() @provide - def get_dlq_repository(self, logger: logging.Logger) -> DLQRepository: + def get_dlq_repository(self, logger: structlog.stdlib.BoundLogger) -> DLQRepository: return DLQRepository(logger) @provide - def get_replay_repository(self, logger: logging.Logger) -> ReplayRepository: + def get_replay_repository(self, logger: structlog.stdlib.BoundLogger) -> ReplayRepository: return ReplayRepository(logger) @provide - def get_event_repository(self, logger: logging.Logger) -> EventRepository: + def get_event_repository(self, logger: structlog.stdlib.BoundLogger) -> EventRepository: return EventRepository(logger) @provide - def get_user_settings_repository(self, logger: logging.Logger) -> UserSettingsRepository: + def get_user_settings_repository(self, logger: structlog.stdlib.BoundLogger) -> UserSettingsRepository: return UserSettingsRepository(logger) @provide @@ -454,7 +482,7 @@ def get_admin_events_repository(self) -> AdminEventsRepository: return AdminEventsRepository() @provide - def get_admin_settings_repository(self, logger: logging.Logger) -> AdminSettingsRepository: + def get_admin_settings_repository(self, logger: structlog.stdlib.BoundLogger) -> AdminSettingsRepository: return AdminSettingsRepository(logger) @provide @@ -462,7 +490,7 @@ def get_admin_user_repository(self) -> AdminUserRepository: return AdminUserRepository() @provide - def get_notification_repository(self, logger: logging.Logger) -> NotificationRepository: + def get_notification_repository(self, logger: structlog.stdlib.BoundLogger) -> NotificationRepository: return NotificationRepository(logger) @provide @@ -480,7 +508,7 @@ class SSEProvider(Provider): scope = Scope.APP @provide - def get_sse_redis_bus(self, redis_client: redis.Redis, logger: logging.Logger) -> SSERedisBus: + def get_sse_redis_bus(self, redis_client: redis.Redis, logger: structlog.stdlib.BoundLogger) -> SSERedisBus: return SSERedisBus(redis_client, logger) @provide(scope=Scope.REQUEST) @@ -489,7 +517,7 @@ def get_sse_service( sse_repository: SSERepository, sse_redis_bus: SSERedisBus, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, connection_metrics: ConnectionMetrics, ) -> SSEService: return SSEService( @@ -506,7 +534,10 @@ class AuthProvider(Provider): @provide def get_auth_service( - self, user_repository: UserRepository, security_service: SecurityService, logger: logging.Logger + self, + user_repository: UserRepository, + security_service: SecurityService, + logger: structlog.stdlib.BoundLogger, ) -> AuthService: return AuthService(user_repository, security_service, logger) @@ -525,7 +556,7 @@ def get_kafka_event_service( self, kafka_producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, event_metrics: EventMetrics, ) -> KafkaEventService: return KafkaEventService( @@ -545,7 +576,7 @@ def 
get_user_settings_service( repository: UserSettingsRepository, kafka_event_service: KafkaEventService, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> UserSettingsService: return UserSettingsService(repository, kafka_event_service, settings, logger) @@ -558,7 +589,7 @@ def get_runtime_settings_loader( self, admin_settings_repository: AdminSettingsRepository, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> RuntimeSettingsLoader: return RuntimeSettingsLoader(admin_settings_repository, settings, logger) @@ -567,7 +598,7 @@ def get_login_lockout_service( self, redis_client: redis.Redis, runtime_settings: RuntimeSettingsLoader, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> LoginLockoutService: return LoginLockoutService(redis_client, runtime_settings, logger) @@ -576,7 +607,7 @@ def get_admin_events_service( self, admin_events_repository: AdminEventsRepository, event_replay_service: EventReplayService, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> AdminEventsService: return AdminEventsService(admin_events_repository, event_replay_service, logger) @@ -585,7 +616,7 @@ def get_admin_settings_service( self, admin_settings_repository: AdminSettingsRepository, runtime_settings: RuntimeSettingsLoader, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> AdminSettingsService: return AdminSettingsService(admin_settings_repository, runtime_settings, logger) @@ -596,7 +627,7 @@ def get_notification_service( kafka_event_service: KafkaEventService, sse_redis_bus: SSERedisBus, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, notification_metrics: NotificationMetrics, ) -> NotificationService: return NotificationService( @@ -613,7 +644,7 @@ async def get_notification_scheduler( self, notification_repository: NotificationRepository, notification_service: NotificationService, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> AsyncIterator[NotificationScheduler]: scheduler_service = NotificationScheduler( @@ -668,7 +699,7 @@ def get_saga_service( saga_repository: SagaRepository, execution_repository: ExecutionRepository, saga_orchestrator: SagaOrchestrator, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> SagaService: return SagaService( saga_repo=saga_repository, @@ -683,10 +714,11 @@ def get_execution_service( execution_repository: ExecutionRepository, kafka_producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, execution_metrics: ExecutionMetrics, idempotency_manager: IdempotencyManager, runtime_settings: RuntimeSettingsLoader, + request: Request, ) -> ExecutionService: return ExecutionService( execution_repo=execution_repository, @@ -696,11 +728,12 @@ def get_execution_service( execution_metrics=execution_metrics, idempotency_manager=idempotency_manager, runtime_settings=runtime_settings, + correlation_id=request.state.correlation_id, ) @provide def get_saved_script_service( - self, saved_script_repository: SavedScriptRepository, logger: logging.Logger + self, saved_script_repository: SavedScriptRepository, logger: structlog.stdlib.BoundLogger ) -> SavedScriptService: return SavedScriptService(saved_script_repository, logger) @@ -712,7 +745,7 @@ def get_admin_user_service( execution_service: ExecutionService, rate_limit_service: RateLimitService, security_service: SecurityService, - logger: logging.Logger, + logger: 
structlog.stdlib.BoundLogger, ) -> AdminUserService: return AdminUserService( user_repository=admin_user_repository, @@ -732,7 +765,7 @@ def get_execution_coordinator( self, producer: UnifiedProducer, execution_repository: ExecutionRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, coordinator_metrics: CoordinatorMetrics, ) -> ExecutionCoordinator: return ExecutionCoordinator( @@ -752,7 +785,7 @@ def get_kubernetes_worker( api_client: k8s_client.ApiClient, kafka_producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, event_metrics: EventMetrics, ) -> KubernetesWorker: return KubernetesWorker( @@ -770,7 +803,7 @@ class PodMonitorProvider(Provider): @provide def get_event_mapper( self, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, api_client: k8s_client.ApiClient, ) -> PodEventMapper: return PodEventMapper(logger=logger, k8s_api=k8s_client.CoreV1Api(api_client)) @@ -780,7 +813,7 @@ async def get_pod_monitor( self, kafka_event_service: KafkaEventService, api_client: k8s_client.ApiClient, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, event_mapper: PodEventMapper, kubernetes_metrics: KubernetesMetrics, ) -> AsyncIterator[PodMonitor]: @@ -839,7 +872,7 @@ def get_saga_orchestrator( saga_repository: SagaRepository, kafka_producer: UnifiedProducer, resource_allocation_repository: ResourceAllocationRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> SagaOrchestrator: return SagaOrchestrator( config=_create_default_saga_config(), @@ -865,7 +898,7 @@ async def get_saga_orchestrator( saga_repository: SagaRepository, kafka_producer: UnifiedProducer, resource_allocation_repository: ResourceAllocationRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> AsyncIterator[SagaOrchestrator]: orchestrator = SagaOrchestrator( @@ -904,7 +937,7 @@ def get_result_processor( execution_repo: ExecutionRepository, kafka_producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, execution_metrics: ExecutionMetrics, ) -> ResultProcessor: return ResultProcessor( @@ -925,7 +958,7 @@ def get_event_replay_service( replay_repository: ReplayRepository, kafka_producer: UnifiedProducer, replay_metrics: ReplayMetrics, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> EventReplayService: return EventReplayService( repository=replay_repository, @@ -950,7 +983,7 @@ async def get_event_replay_service( replay_repository: ReplayRepository, kafka_producer: UnifiedProducer, replay_metrics: ReplayMetrics, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> AsyncIterator[EventReplayService]: service = EventReplayService( diff --git a/backend/app/core/tracing/__init__.py b/backend/app/core/tracing/__init__.py index 2116b5b2..77236470 100644 --- a/backend/app/core/tracing/__init__.py +++ b/backend/app/core/tracing/__init__.py @@ -1,46 +1,64 @@ -# Re-export commonly used OpenTelemetry types for convenience -from opentelemetry import context -from opentelemetry.trace import SpanKind, Status, StatusCode - -# Import configuration and initialization -from app.core.tracing.config import ( - TracingConfiguration, - TracingInitializer, - init_tracing, -) -from app.core.tracing.models import ( - EventAttributes, - InstrumentationReport, - InstrumentationResult, - InstrumentationStatus, - TracerManager, -) - -# Import utilities and decorators -from app.core.tracing.utils import ( 
- add_span_attributes, - get_tracer, - trace_span, -) - -__all__ = [ - # Models and enums - "EventAttributes", - "InstrumentationReport", - "InstrumentationResult", - "InstrumentationStatus", - "TracerManager", - # Configuration and initialization - "TracingConfiguration", - "TracingInitializer", - "init_tracing", - # Utilities and decorators - "add_span_attributes", - "get_tracer", - "trace_span", - # OpenTelemetry types - "context", - "SpanKind", - "Status", - "StatusCode", -] +import os + +import structlog +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +from opentelemetry.instrumentation.logging import LoggingInstrumentor +from opentelemetry.instrumentation.pymongo import PymongoInstrumentor +from opentelemetry.propagate import set_global_textmap +from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.sampling import ALWAYS_OFF, ALWAYS_ON, ParentBased, Sampler, TraceIdRatioBased +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + +from app.settings import Settings + + +class Tracer: + """DI-managed OpenTelemetry tracer. Initialization happens on construction.""" + + def __init__(self, settings: Settings, logger: structlog.stdlib.BoundLogger) -> None: + name = settings.TRACING_SERVICE_NAME + rate = settings.TRACING_SAMPLING_RATE + + resource = Resource.create({ + SERVICE_NAME: name, + SERVICE_VERSION: settings.TRACING_SERVICE_VERSION, + "deployment.environment": "test" if settings.TESTING else "production", + "service.namespace": "integr8scode", + "service.instance.id": os.environ.get("HOSTNAME", "unknown"), + }) + + sampler: Sampler + if rate <= 0: + sampler = ALWAYS_OFF + elif rate >= 1.0: + sampler = ALWAYS_ON + else: + sampler = ParentBased(root=TraceIdRatioBased(rate)) + + provider = TracerProvider(resource=resource, sampler=sampler) + + if settings.OTLP_TRACES_ENDPOINT: + provider.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter( + endpoint=settings.OTLP_TRACES_ENDPOINT, + insecure=True, + )) + ) + + trace.set_tracer_provider(provider) + set_global_textmap(TraceContextTextMapPropagator()) + + tp = trace.get_tracer_provider() + FastAPIInstrumentor().instrument( + tracer_provider=tp, excluded_urls="health,metrics,docs,openapi.json", + ) + HTTPXClientInstrumentor().instrument(tracer_provider=tp) + PymongoInstrumentor().instrument(tracer_provider=tp) + LoggingInstrumentor().instrument(set_logging_format=True, log_level="INFO") + + logger.info(f"Tracing initialized for {name}") diff --git a/backend/app/core/tracing/config.py b/backend/app/core/tracing/config.py deleted file mode 100644 index 5aa61fa3..00000000 --- a/backend/app/core/tracing/config.py +++ /dev/null @@ -1,200 +0,0 @@ -import logging -import os - -from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor -from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor -from opentelemetry.instrumentation.logging import LoggingInstrumentor -from opentelemetry.instrumentation.pymongo import PymongoInstrumentor -from opentelemetry.propagate import set_global_textmap 
-from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter -from opentelemetry.sdk.trace.sampling import ALWAYS_OFF, ALWAYS_ON, Sampler, TraceIdRatioBased -from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator - -from app.core.adaptive_sampling import create_adaptive_sampler -from app.core.tracing.models import ( - InstrumentationReport, - InstrumentationResult, - InstrumentationStatus, - LibraryInstrumentation, -) -from app.settings import Settings - - -class TracingConfiguration: - """Configuration for OpenTelemetry tracing.""" - - def __init__( - self, - service_name: str, - settings: Settings, - service_version: str = "1.0.0", - otlp_endpoint: str | None = None, - enable_console_exporter: bool = False, - sampling_rate: float = 1.0, - adaptive_sampling: bool = False, - ) -> None: - self.service_name = service_name - self.service_version = service_version - self.otlp_endpoint = otlp_endpoint - self.enable_console_exporter = enable_console_exporter - self.sampling_rate = sampling_rate - self.adaptive_sampling = adaptive_sampling - self._settings = settings - - def create_resource(self) -> Resource: - """Create OpenTelemetry resource with service metadata.""" - return Resource.create( - { - SERVICE_NAME: self.service_name, - SERVICE_VERSION: self.service_version, - "deployment.environment": self._get_environment(), - "service.namespace": "integr8scode", - "service.instance.id": os.environ.get("HOSTNAME", "unknown"), - } - ) - - def create_sampler(self) -> Sampler: - """Create appropriate sampler based on configuration.""" - if self.adaptive_sampling: - return create_adaptive_sampler(self._settings) - - if self.sampling_rate <= 0: - return ALWAYS_OFF - - if self.sampling_rate >= 1.0: - return ALWAYS_ON - - return TraceIdRatioBased(self.sampling_rate) - - def get_otlp_endpoint(self) -> str | None: - """Get OTLP endpoint from config or environment.""" - if self.otlp_endpoint: - return self.otlp_endpoint - - if self._settings.JAEGER_AGENT_HOST: - return f"{self._settings.JAEGER_AGENT_HOST}:4317" - - return None - - def _get_environment(self) -> str: - """Get deployment environment.""" - return "test" if self._settings.TESTING else "production" - - -class TracingInitializer: - """Initializes OpenTelemetry tracing with instrumentation.""" - - def __init__(self, config: TracingConfiguration, logger: logging.Logger) -> None: - self.config = config - self.instrumentation_report = InstrumentationReport() - self.logger = logger - - def initialize(self) -> InstrumentationReport: - """Initialize tracing and instrument libraries.""" - provider = self._create_provider() - self._configure_exporters(provider) - trace.set_tracer_provider(provider) - set_global_textmap(TraceContextTextMapPropagator()) - - self._instrument_libraries() - - self.logger.info( - f"OpenTelemetry tracing initialized for {self.config.service_name}", - extra={"instrumentation_summary": self.instrumentation_report.get_summary()}, - ) - - return self.instrumentation_report - - def _create_provider(self) -> TracerProvider: - """Create tracer provider with resource and sampler.""" - return TracerProvider(resource=self.config.create_resource(), sampler=self.config.create_sampler()) - - def _configure_exporters(self, provider: TracerProvider) -> None: - """Configure span exporters.""" - otlp_endpoint = self.config.get_otlp_endpoint() - if 
otlp_endpoint: - otlp_exporter = OTLPSpanExporter( - endpoint=otlp_endpoint, - insecure=True, - ) - provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) - - if self.config.enable_console_exporter: - console_exporter = ConsoleSpanExporter() - provider.add_span_processor(BatchSpanProcessor(console_exporter)) - - def _instrument_libraries(self) -> None: - """Instrument all configured libraries.""" - libraries = self._get_libraries_to_instrument() - - for lib in libraries: - result = self._instrument_library(lib) - self.instrumentation_report.add_result(result) - - def _get_libraries_to_instrument(self) -> list[LibraryInstrumentation]: - """Get list of libraries to instrument.""" - return [ - LibraryInstrumentation( - name="fastapi", - instrumentor=FastAPIInstrumentor(), - config={ - "tracer_provider": trace.get_tracer_provider(), - "excluded_urls": "health,metrics,docs,openapi.json", - }, - ), - LibraryInstrumentation( - name="httpx", - instrumentor=HTTPXClientInstrumentor(), - config={"tracer_provider": trace.get_tracer_provider()}, - ), - LibraryInstrumentation( - name="pymongo", - instrumentor=PymongoInstrumentor(), - config={"tracer_provider": trace.get_tracer_provider()}, - ), - LibraryInstrumentation( - name="logging", - instrumentor=LoggingInstrumentor(), - config={"set_logging_format": True, "log_level": "INFO"}, - ), - ] - - def _instrument_library(self, lib: LibraryInstrumentation) -> InstrumentationResult: - """Instrument a single library and return result.""" - try: - lib.instrumentor.instrument(**lib.config) - return InstrumentationResult(library=lib.name, status=InstrumentationStatus.SUCCESS) - except Exception as e: - self.logger.warning( - f"Failed to instrument {lib.name}", exc_info=True, extra={"library": lib.name, "error": str(e)} - ) - return InstrumentationResult(library=lib.name, status=InstrumentationStatus.FAILED, error=e) - - -def init_tracing( - service_name: str, - settings: Settings, - logger: logging.Logger, - service_version: str = "1.0.0", - otlp_endpoint: str | None = None, - enable_console_exporter: bool = False, - sampling_rate: float = 1.0, - adaptive_sampling: bool = False, -) -> InstrumentationReport: - """Initialize OpenTelemetry tracing with the given configuration.""" - config = TracingConfiguration( - service_name=service_name, - settings=settings, - service_version=service_version, - otlp_endpoint=otlp_endpoint, - enable_console_exporter=enable_console_exporter, - sampling_rate=sampling_rate, - adaptive_sampling=adaptive_sampling, - ) - - initializer = TracingInitializer(config, logger) - return initializer.initialize() diff --git a/backend/app/core/tracing/models.py b/backend/app/core/tracing/models.py deleted file mode 100644 index 2ced50ae..00000000 --- a/backend/app/core/tracing/models.py +++ /dev/null @@ -1,92 +0,0 @@ -from dataclasses import dataclass, field -from typing import Any, Protocol - -from opentelemetry import trace - -from app.core.utils import StringEnum - - -class EventAttributes(StringEnum): - """Standard attribute names for tracing events.""" - - EVENT_TYPE = "event.type" - EVENT_ID = "event.id" - EXECUTION_ID = "execution.id" - USER_ID = "user.id" - POD_NAME = "k8s.pod.name" - POD_NAMESPACE = "k8s.pod.namespace" - KAFKA_TOPIC = "messaging.kafka.topic" - KAFKA_PARTITION = "messaging.kafka.partition" - KAFKA_OFFSET = "messaging.kafka.offset" - KAFKA_KEY = "messaging.kafka.message_key" - CONSUMER_GROUP = "messaging.kafka.consumer_group" - SAGA_NAME = "saga.name" - SAGA_ID = "saga.id" - SAGA_STEP = "saga.step" - QUEUE_NAME 
= "queue.name" - QUEUE_POSITION = "queue.position" - - -class InstrumentationStatus(StringEnum): - """Status of library instrumentation.""" - - SUCCESS = "success" - FAILED = "failed" - NOT_ATTEMPTED = "not_attempted" - - -@dataclass -class InstrumentationResult: - """Result of instrumenting a single library.""" - - library: str - status: InstrumentationStatus - error: Exception | None = None - - -@dataclass -class InstrumentationReport: - """Report of all instrumentation results.""" - - results: dict[str, InstrumentationResult] = field(default_factory=dict) - - def add_result(self, result: InstrumentationResult) -> None: - """Add an instrumentation result to the report.""" - self.results[result.library] = result - - def get_summary(self) -> dict[str, str]: - """Get a summary of instrumentation statuses.""" - return {library: result.status for library, result in self.results.items()} - - def has_failures(self) -> bool: - """Check if any instrumentation failed.""" - return any(result.status == InstrumentationStatus.FAILED for result in self.results.values()) - - -class Instrumentor(Protocol): - """Protocol for OpenTelemetry instrumentors.""" - - def instrument(self, **kwargs: Any) -> None: ... - - -@dataclass -class LibraryInstrumentation: - """Configuration for instrumenting a library.""" - - name: str - instrumentor: Instrumentor - config: dict[str, Any] = field(default_factory=dict) - - -class TracerManager: - """Manager for OpenTelemetry tracers.""" - - def __init__(self, tracer_name: str = __name__) -> None: - self._tracer_name = tracer_name - self._tracer: trace.Tracer | None = None - - def get_tracer(self) -> trace.Tracer: - """Get or create a tracer instance.""" - if self._tracer is None: - self._tracer = trace.get_tracer(self._tracer_name) - return self._tracer diff --git a/backend/app/core/tracing/utils.py b/backend/app/core/tracing/utils.py deleted file mode 100644 index 54e84f83..00000000 --- a/backend/app/core/tracing/utils.py +++ /dev/null @@ -1,62 +0,0 @@ -from collections.abc import Generator -from contextlib import contextmanager -from typing import Any - -from opentelemetry import trace -from opentelemetry.trace import SpanKind, Status, StatusCode - - -def get_tracer() -> trace.Tracer: - """Get a tracer for the current module.""" - return trace.get_tracer(__name__) - - -@contextmanager -def trace_span( - name: str, - kind: SpanKind = SpanKind.INTERNAL, - attributes: dict[str, Any] | None = None, - set_status_on_exception: bool = True, - tracer: trace.Tracer | None = None, -) -> Generator[trace.Span, None, None]: - """ - Context manager for creating a traced span. - - Args: - name: Name of the span - kind: Kind of span (INTERNAL, CLIENT, SERVER, etc.) - attributes: Additional attributes to set on the span - set_status_on_exception: Whether to set error status on exception - tracer: Optional tracer to use, defaults to module tracer - - Yields: - The created span - """ - if tracer is None: - tracer = get_tracer() - - with tracer.start_as_current_span(name, kind=kind, attributes=attributes or {}) as span: - try: - yield span - except Exception as e: - if set_status_on_exception: - span.set_status(Status(StatusCode.ERROR, str(e))) - span.record_exception(e) - raise - - - -def add_span_attributes(**attributes: Any) -> None: - """ - Add attributes to the current span. 
- - Args: - **attributes: Key-value pairs to add as span attributes - """ - span = trace.get_current_span() - if span and span.is_recording(): - for key, value in attributes.items(): - if value is not None: - span.set_attribute(key, value) - - diff --git a/backend/app/db/repositories/admin/admin_settings_repository.py b/backend/app/db/repositories/admin/admin_settings_repository.py index 227a8236..fa9df639 100644 --- a/backend/app/db/repositories/admin/admin_settings_repository.py +++ b/backend/app/db/repositories/admin/admin_settings_repository.py @@ -1,12 +1,13 @@ -import logging from datetime import datetime, timezone +import structlog + from app.db.docs.admin_settings import AuditLogDocument, SystemSettingsDocument from app.domain.admin import AuditAction, SystemSettings class AdminSettingsRepository: - def __init__(self, logger: logging.Logger): + def __init__(self, logger: structlog.stdlib.BoundLogger): self.logger = logger async def get_system_settings( diff --git a/backend/app/db/repositories/dlq_repository.py b/backend/app/db/repositories/dlq_repository.py index 9c7fa172..f697685d 100644 --- a/backend/app/db/repositories/dlq_repository.py +++ b/backend/app/db/repositories/dlq_repository.py @@ -1,7 +1,7 @@ -import logging from datetime import datetime, timezone from typing import Any +import structlog from beanie.odm.enums import SortDirection from beanie.operators import Set from monggregate import Pipeline, S @@ -18,7 +18,7 @@ class DLQRepository: - def __init__(self, logger: logging.Logger): + def __init__(self, logger: structlog.stdlib.BoundLogger): self.logger = logger async def get_messages( diff --git a/backend/app/db/repositories/event_repository.py b/backend/app/db/repositories/event_repository.py index 9f360854..ddcc16de 100644 --- a/backend/app/db/repositories/event_repository.py +++ b/backend/app/db/repositories/event_repository.py @@ -1,14 +1,14 @@ -import logging from datetime import datetime, timezone from typing import Any, Mapping +import structlog from beanie.odm.enums import SortDirection from beanie.odm.operators.find import BaseFindOperator from beanie.operators import GTE, LTE, Eq, In, Not, Or, RegEx from monggregate import Pipeline, S +from opentelemetry import trace from pymongo.errors import DuplicateKeyError -from app.core.tracing import EventAttributes, add_span_attributes from app.db.docs import EventArchiveDocument, EventDocument from app.domain.enums import EventType from app.domain.events import ( @@ -24,7 +24,7 @@ class EventRepository: - def __init__(self, logger: logging.Logger) -> None: + def __init__(self, logger: structlog.stdlib.BoundLogger) -> None: self.logger = logger def _time_conditions(self, start_time: datetime | None, end_time: datetime | None) -> list[Any]: @@ -44,13 +44,11 @@ async def store_event(self, event: DomainEvent) -> str: data = event.model_dump(exclude_none=True) data.setdefault("stored_at", datetime.now(timezone.utc)) doc = EventDocument(**data) - add_span_attributes( - **{ - str(EventAttributes.EVENT_TYPE): event.event_type, - str(EventAttributes.EVENT_ID): event.event_id, - str(EventAttributes.EXECUTION_ID): event.aggregate_id or "", - } - ) + trace.get_current_span().set_attributes({ + "event.type": event.event_type, + "event.id": event.event_id, + "execution.id": event.aggregate_id or "", + }) try: await doc.insert() except DuplicateKeyError: diff --git a/backend/app/db/repositories/execution_repository.py b/backend/app/db/repositories/execution_repository.py index 0dfaf64a..de1aa1dc 100644 --- 
a/backend/app/db/repositories/execution_repository.py +++ b/backend/app/db/repositories/execution_repository.py @@ -1,7 +1,7 @@ -import logging from datetime import datetime, timezone from typing import Any +import structlog from beanie.odm.enums import SortDirection from app.db.docs import ExecutionDocument @@ -14,7 +14,7 @@ class ExecutionRepository: - def __init__(self, logger: logging.Logger): + def __init__(self, logger: structlog.stdlib.BoundLogger): self.logger = logger async def create_execution(self, create_data: DomainExecutionCreate) -> DomainExecution: diff --git a/backend/app/db/repositories/notification_repository.py b/backend/app/db/repositories/notification_repository.py index ab3ad677..eb8718ce 100644 --- a/backend/app/db/repositories/notification_repository.py +++ b/backend/app/db/repositories/notification_repository.py @@ -1,6 +1,6 @@ -import logging from datetime import UTC, datetime, timedelta +import structlog from beanie.odm.enums import SortDirection from beanie.operators import GTE, LTE, ElemMatch, In, NotIn, Or @@ -16,7 +16,7 @@ class NotificationRepository: - def __init__(self, logger: logging.Logger): + def __init__(self, logger: structlog.stdlib.BoundLogger): self.logger = logger async def create_notification(self, create_data: DomainNotificationCreate) -> DomainNotification: diff --git a/backend/app/db/repositories/replay_repository.py b/backend/app/db/repositories/replay_repository.py index c3f44586..10f1a6e7 100644 --- a/backend/app/db/repositories/replay_repository.py +++ b/backend/app/db/repositories/replay_repository.py @@ -1,7 +1,7 @@ -import logging from datetime import datetime from typing import Any, AsyncIterator +import structlog from beanie.odm.enums import SortDirection from beanie.operators import LT, In @@ -12,7 +12,7 @@ class ReplayRepository: - def __init__(self, logger: logging.Logger) -> None: + def __init__(self, logger: structlog.stdlib.BoundLogger) -> None: self.logger = logger async def save_session(self, session: ReplaySessionState) -> None: diff --git a/backend/app/db/repositories/user_settings_repository.py b/backend/app/db/repositories/user_settings_repository.py index db53a68c..f15f5d28 100644 --- a/backend/app/db/repositories/user_settings_repository.py +++ b/backend/app/db/repositories/user_settings_repository.py @@ -1,6 +1,6 @@ -import logging from datetime import datetime +import structlog from beanie.odm.enums import SortDirection from beanie.odm.operators.find import BaseFindOperator from beanie.operators import GT, LTE, Eq, In @@ -11,7 +11,7 @@ class UserSettingsRepository: - def __init__(self, logger: logging.Logger) -> None: + def __init__(self, logger: structlog.stdlib.BoundLogger) -> None: self.logger = logger async def get_snapshot(self, user_id: str) -> DomainUserSettings | None: diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 323a12c4..bfc910f5 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -1,7 +1,7 @@ -import logging from datetime import datetime, timezone from typing import Callable +import structlog from faststream.kafka import KafkaBroker from app.core.metrics import DLQMetrics @@ -37,7 +37,7 @@ def __init__( self, settings: Settings, broker: KafkaBroker, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, dlq_metrics: DLQMetrics, repository: DLQRepository, default_retry_policy: RetryPolicy, diff --git a/backend/app/domain/events/typed.py b/backend/app/domain/events/typed.py index 7ce79bb0..acbaca32 100644 --- 
a/backend/app/domain/events/typed.py +++ b/backend/app/domain/events/typed.py @@ -4,7 +4,6 @@ from pydantic import BaseModel, ConfigDict, Discriminator, Field, TypeAdapter -from app.core.correlation import CorrelationContext from app.domain.enums import ( Environment, EventType, @@ -35,7 +34,7 @@ class EventMetadata(BaseModel): service_name: str service_version: str - correlation_id: str = Field(default_factory=CorrelationContext.get_correlation_id) + correlation_id: str = "" user_id: str = Field(default_factory=lambda: str(uuid4())) environment: Environment = Environment.PRODUCTION diff --git a/backend/app/events/core/producer.py b/backend/app/events/core/producer.py index a1e5caf7..1946f479 100644 --- a/backend/app/events/core/producer.py +++ b/backend/app/events/core/producer.py @@ -1,8 +1,8 @@ import asyncio -import logging import socket from datetime import datetime, timezone +import structlog from faststream.kafka import KafkaBroker from app.core.metrics import EventMetrics @@ -25,7 +25,7 @@ def __init__( self, broker: KafkaBroker, event_repository: EventRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, settings: Settings, event_metrics: EventMetrics, ): diff --git a/backend/app/main.py b/backend/app/main.py index afcf28c6..33450805 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -101,7 +101,6 @@ def create_app(settings: Settings | None = None) -> FastAPI: "X-Requested-With", "X-CSRF-Token", "X-Correlation-ID", - "X-Request-ID", ], expose_headers=["Content-Length", "Content-Range", "X-Correlation-ID"], ) diff --git a/backend/app/services/admin/admin_events_service.py b/backend/app/services/admin/admin_events_service.py index 824fd299..cd1dbfdc 100644 --- a/backend/app/services/admin/admin_events_service.py +++ b/backend/app/services/admin/admin_events_service.py @@ -1,11 +1,12 @@ import csv import json -import logging from dataclasses import dataclass from datetime import datetime, timedelta, timezone from io import StringIO from typing import Any +import structlog + from app.db.docs.replay import ReplaySessionDocument from app.db.repositories import AdminEventsRepository from app.domain.admin import ReplaySessionData, ReplaySessionStatusDetail, ReplaySessionUpdate @@ -42,14 +43,14 @@ def _export_row_to_dict(row: EventExportRow) -> dict[str, str]: class AdminReplayResult: def __init__( - self, - *, - dry_run: bool, - total_events: int, - replay_correlation_id: str, - status: ReplayStatus, - session_id: str | None = None, - events_preview: list[EventSummary] | None = None, + self, + *, + dry_run: bool, + total_events: int, + replay_correlation_id: str, + status: ReplayStatus, + session_id: str | None = None, + events_preview: list[EventSummary] | None = None, ) -> None: self.dry_run = dry_run self.total_events = total_events @@ -68,18 +69,21 @@ class ExportResult: class AdminEventsService: def __init__( - self, repository: AdminEventsRepository, replay_service: EventReplayService, logger: logging.Logger + self, + repository: AdminEventsRepository, + replay_service: EventReplayService, + logger: structlog.stdlib.BoundLogger, ) -> None: self._repo = repository self._replay_service = replay_service self.logger = logger async def browse_events( - self, - *, - event_filter: EventFilter, - skip: int, - limit: int, + self, + *, + event_filter: EventFilter, + skip: int, + limit: int, ) -> EventBrowseResult: return await self._repo.get_events_page(event_filter, skip=skip, limit=limit) @@ -90,12 +94,12 @@ async def get_event_stats(self, *, hours: int) 
-> EventStatistics: return await self._repo.get_event_stats(hours=hours) async def prepare_or_schedule_replay( - self, - *, - replay_filter: ReplayFilter, - dry_run: bool, - replay_correlation_id: str, - target_service: str | None, + self, + *, + replay_filter: ReplayFilter, + dry_run: bool, + replay_correlation_id: str, + target_service: str | None, ) -> AdminReplayResult: if replay_filter.is_empty(): raise ValidationError("Must specify at least one filter for replay") @@ -212,7 +216,7 @@ def _estimate_completion(self, doc: ReplaySessionDocument, now: datetime) -> dat return now + timedelta(seconds=remaining / rate) if rate > 0 else None async def export_events( - self, *, event_filter: EventFilter, limit: int, export_format: ExportFormat + self, *, event_filter: EventFilter, limit: int, export_format: ExportFormat ) -> ExportResult: if export_format == ExportFormat.CSV: return await self._export_csv(event_filter=event_filter, limit=limit) diff --git a/backend/app/services/admin/admin_settings_service.py b/backend/app/services/admin/admin_settings_service.py index eee352da..48ea0cae 100644 --- a/backend/app/services/admin/admin_settings_service.py +++ b/backend/app/services/admin/admin_settings_service.py @@ -1,4 +1,4 @@ -import logging +import structlog from app.db.repositories import AdminSettingsRepository from app.domain.admin import SystemSettings @@ -10,7 +10,7 @@ def __init__( self, repository: AdminSettingsRepository, runtime_settings: RuntimeSettingsLoader, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ): self._repo = repository self._runtime_settings = runtime_settings diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index 69bca48c..c60e1267 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -1,6 +1,7 @@ -import logging from datetime import datetime, timedelta, timezone +import structlog + from app.core.security import SecurityService from app.db.repositories import AdminUserRepository from app.domain.admin import AdminUserOverviewDomain, DerivedCountsDomain, RateLimitSummaryDomain @@ -22,7 +23,7 @@ def __init__( execution_service: ExecutionService, rate_limit_service: RateLimitService, security_service: SecurityService, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: self._users = user_repository self._events = event_service diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index b1cdf066..fd05954e 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -1,5 +1,4 @@ -import logging - +import structlog from fastapi import Request from app.core.security import SecurityService @@ -9,7 +8,12 @@ class AuthService: - def __init__(self, user_repo: UserRepository, security_service: SecurityService, logger: logging.Logger): + def __init__( + self, + user_repo: UserRepository, + security_service: SecurityService, + logger: structlog.stdlib.BoundLogger, + ): self.user_repo = user_repo self.security_service = security_service self.logger = logger diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py index 6afa7023..32ee9c29 100644 --- a/backend/app/services/coordinator/coordinator.py +++ b/backend/app/services/coordinator/coordinator.py @@ -1,10 +1,11 @@ import asyncio import heapq -import logging import time from collections import defaultdict from uuid import uuid4 +import 
structlog + from app.core.metrics import CoordinatorMetrics from app.db.repositories import ExecutionRepository from app.domain.enums import ExecutionErrorType, QueuePriority @@ -42,7 +43,7 @@ def __init__( self, producer: UnifiedProducer, execution_repository: ExecutionRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, coordinator_metrics: CoordinatorMetrics, max_queue_size: int = 10000, max_executions_per_user: int = 100, diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index 598b9767..ebb6a8ec 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -1,12 +1,12 @@ import asyncio import json -import logging from collections.abc import AsyncIterator from datetime import datetime, timedelta, timezone from uuid import uuid4 import aiofiles import backoff +import structlog from apscheduler.schedulers.asyncio import AsyncIOScheduler from pydantic import ValidationError @@ -33,7 +33,7 @@ def __init__( repository: ReplayRepository, producer: UnifiedProducer, replay_metrics: ReplayMetrics, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: self._sessions: dict[str, ReplaySessionState] = {} self._schedulers: dict[str, AsyncIOScheduler] = {} diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py index 4c3c2c75..f6dedfd5 100644 --- a/backend/app/services/execution_service.py +++ b/backend/app/services/execution_service.py @@ -1,9 +1,10 @@ -import logging from datetime import datetime, timezone from time import time from typing import Any from uuid import uuid4 +import structlog + from app.core.metrics import ExecutionMetrics from app.db.repositories import ExecutionRepository from app.domain.enums import CancelStatus, EventType, ExecutionStatus, QueuePriority @@ -45,23 +46,12 @@ def __init__( execution_repo: ExecutionRepository, producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, execution_metrics: ExecutionMetrics, idempotency_manager: IdempotencyManager, runtime_settings: RuntimeSettingsLoader, + correlation_id: str, ) -> None: - """ - Initialize execution service. - - Args: - execution_repo: Repository for execution data persistence. - producer: Kafka producer for publishing events. - settings: Application settings. - logger: Logger instance. - execution_metrics: Metrics for tracking execution operations. - idempotency_manager: Manager for HTTP idempotency. - runtime_settings: Loader for admin-configurable runtime settings. 
- """ self.execution_repo = execution_repo self.producer = producer self.settings = settings @@ -69,6 +59,7 @@ def __init__( self.metrics = execution_metrics self.idempotency_manager = idempotency_manager self._runtime_settings = runtime_settings + self._correlation_id = correlation_id async def get_k8s_resource_limits(self) -> ResourceLimitsDomain: effective = await self._runtime_settings.get_effective_settings() @@ -90,6 +81,7 @@ def _create_event_metadata(self, user_id: str) -> EventMetadata: service_name="execution-service", service_version="2.0.0", user_id=user_id, + correlation_id=self._correlation_id, ) async def execute_script( diff --git a/backend/app/services/idempotency/idempotency_manager.py b/backend/app/services/idempotency/idempotency_manager.py index c8ed1d18..0a6c781b 100644 --- a/backend/app/services/idempotency/idempotency_manager.py +++ b/backend/app/services/idempotency/idempotency_manager.py @@ -1,8 +1,8 @@ import hashlib import json -import logging from datetime import datetime, timedelta, timezone +import structlog from pydantic import BaseModel from pymongo.errors import DuplicateKeyError @@ -37,7 +37,7 @@ def __init__( self, config: IdempotencyConfig, repository: RedisIdempotencyRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, database_metrics: DatabaseMetrics, ) -> None: self.config = config diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index be6cb836..591a27c1 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -1,9 +1,9 @@ import asyncio -import logging import time from pathlib import Path from typing import Any +import structlog from kubernetes_asyncio import client as k8s_client from kubernetes_asyncio.client.rest import ApiException @@ -41,7 +41,7 @@ def __init__( api_client: k8s_client.ApiClient, producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, event_metrics: EventMetrics, ): self._event_metrics = event_metrics diff --git a/backend/app/services/kafka_event_service.py b/backend/app/services/kafka_event_service.py index 4e21fec5..82127b27 100644 --- a/backend/app/services/kafka_event_service.py +++ b/backend/app/services/kafka_event_service.py @@ -1,6 +1,6 @@ -import logging import time +import structlog from opentelemetry import trace from app.core.metrics import EventMetrics @@ -16,7 +16,7 @@ def __init__( self, kafka_producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, event_metrics: EventMetrics, ): self.kafka_producer = kafka_producer diff --git a/backend/app/services/login_lockout.py b/backend/app/services/login_lockout.py index e0d7e023..1b2cf156 100644 --- a/backend/app/services/login_lockout.py +++ b/backend/app/services/login_lockout.py @@ -1,6 +1,5 @@ -import logging - import redis.asyncio as redis +import structlog from app.services.runtime_settings import RuntimeSettingsLoader @@ -14,7 +13,7 @@ def __init__( self, redis_client: redis.Redis, runtime_settings: RuntimeSettingsLoader, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: self._redis = redis_client self._runtime_settings = runtime_settings diff --git a/backend/app/services/notification_scheduler.py b/backend/app/services/notification_scheduler.py index 91bbcd81..a481b83c 100644 --- a/backend/app/services/notification_scheduler.py +++ b/backend/app/services/notification_scheduler.py @@ -1,4 +1,4 @@ -import 
logging +import structlog from app.db.repositories import NotificationRepository from app.services.notification_service import NotificationService @@ -16,7 +16,7 @@ def __init__( self, notification_repository: NotificationRepository, notification_service: NotificationService, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: self.repository = notification_repository self.service = notification_service diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index cadde066..8c7171d6 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -1,14 +1,14 @@ import asyncio -import logging from dataclasses import dataclass, field from datetime import UTC, datetime, timedelta from typing import Awaitable, Callable import backoff import httpx +import structlog +from opentelemetry import trace from app.core.metrics import NotificationMetrics -from app.core.tracing import add_span_attributes from app.db.repositories import NotificationRepository from app.domain.enums import NotificationChannel, NotificationSeverity, NotificationStatus, UserRole from app.domain.events import ( @@ -99,7 +99,7 @@ def __init__( event_service: KafkaEventService, sse_bus: SSERedisBus, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, notification_metrics: NotificationMetrics, ) -> None: self.repository = notification_repository @@ -356,13 +356,11 @@ async def _send_webhook( }, ) - add_span_attributes( - **{ - "notification.id": str(notification.notification_id), - "notification.channel": "webhook", - "notification.webhook_url": webhook_url, - } - ) + trace.get_current_span().set_attributes({ + "notification.id": str(notification.notification_id), + "notification.channel": "webhook", + "notification.webhook_url": webhook_url, + }) async with httpx.AsyncClient() as client: response = await client.post(webhook_url, json=payload, headers=headers, timeout=30.0) response.raise_for_status() @@ -411,12 +409,10 @@ async def _send_slack(self, notification: DomainNotification, subscription: Doma }, ) - add_span_attributes( - **{ - "notification.id": str(notification.notification_id), - "notification.channel": "slack", - } - ) + trace.get_current_span().set_attributes({ + "notification.id": str(notification.notification_id), + "notification.channel": "slack", + }) async with httpx.AsyncClient() as client: response = await client.post(subscription.slack_webhook, json=slack_message, timeout=30.0) response.raise_for_status() diff --git a/backend/app/services/pod_monitor/event_mapper.py b/backend/app/services/pod_monitor/event_mapper.py index b23a5afe..8710060a 100644 --- a/backend/app/services/pod_monitor/event_mapper.py +++ b/backend/app/services/pod_monitor/event_mapper.py @@ -1,9 +1,9 @@ import ast -import logging from collections.abc import Awaitable, Callable from dataclasses import dataclass from uuid import uuid4 +import structlog from kubernetes_asyncio import client as k8s_client from app.core.utils import StringEnum @@ -59,7 +59,7 @@ class PodLogs: class PodEventMapper: """Maps Kubernetes pod objects to application events""" - def __init__(self, logger: logging.Logger, k8s_api: k8s_client.CoreV1Api | None = None) -> None: + def __init__(self, logger: structlog.stdlib.BoundLogger, k8s_api: k8s_client.CoreV1Api | None = None) -> None: self.logger = logger self._event_cache: dict[str, PodPhase] = {} self._k8s_api = k8s_api diff --git 
a/backend/app/services/pod_monitor/monitor.py b/backend/app/services/pod_monitor/monitor.py index 5a1efdc4..2542b92c 100644 --- a/backend/app/services/pod_monitor/monitor.py +++ b/backend/app/services/pod_monitor/monitor.py @@ -1,9 +1,9 @@ -import logging import time from dataclasses import dataclass from enum import auto from typing import Any +import structlog from kubernetes_asyncio import client as k8s_client from kubernetes_asyncio import watch as k8s_watch @@ -54,7 +54,7 @@ def __init__( self, config: PodMonitorConfig, kafka_event_service: KafkaEventService, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, api_client: k8s_client.ApiClient, event_mapper: PodEventMapper, kubernetes_metrics: KubernetesMetrics, diff --git a/backend/app/services/result_processor/processor.py b/backend/app/services/result_processor/processor.py index a2faaa08..eaea4a81 100644 --- a/backend/app/services/result_processor/processor.py +++ b/backend/app/services/result_processor/processor.py @@ -1,4 +1,4 @@ -import logging +import structlog from app.core.metrics import ExecutionMetrics from app.db.repositories import ExecutionRepository @@ -25,7 +25,7 @@ def __init__( execution_repo: ExecutionRepository, producer: UnifiedProducer, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, execution_metrics: ExecutionMetrics, ) -> None: self._execution_repo = execution_repo diff --git a/backend/app/services/result_processor/resource_cleaner.py b/backend/app/services/result_processor/resource_cleaner.py index fbce860a..4734e0db 100644 --- a/backend/app/services/result_processor/resource_cleaner.py +++ b/backend/app/services/result_processor/resource_cleaner.py @@ -1,8 +1,8 @@ import asyncio -import logging from datetime import datetime, timedelta, timezone from typing import Any +import structlog from kubernetes_asyncio import client as k8s_client from kubernetes_asyncio.client.rest import ApiException @@ -19,7 +19,7 @@ class ResourceCleaner: Accepts ApiClient via dependency injection for proper configuration management. 
""" - def __init__(self, api_client: k8s_client.ApiClient, logger: logging.Logger) -> None: + def __init__(self, api_client: k8s_client.ApiClient, logger: structlog.stdlib.BoundLogger) -> None: self.v1 = k8s_client.CoreV1Api(api_client) self.networking_v1 = k8s_client.NetworkingV1Api(api_client) self.logger = logger diff --git a/backend/app/services/runtime_settings.py b/backend/app/services/runtime_settings.py index d61d9a60..59e34c55 100644 --- a/backend/app/services/runtime_settings.py +++ b/backend/app/services/runtime_settings.py @@ -1,6 +1,7 @@ -import logging from time import monotonic +import structlog + from app.db.repositories import AdminSettingsRepository from app.domain.admin import SystemSettings from app.settings import Settings @@ -16,7 +17,7 @@ def __init__( self, repo: AdminSettingsRepository, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: self._repo = repo self._settings = settings diff --git a/backend/app/services/saga/execution_saga.py b/backend/app/services/saga/execution_saga.py index c385f6be..003c08df 100644 --- a/backend/app/services/saga/execution_saga.py +++ b/backend/app/services/saga/execution_saga.py @@ -1,7 +1,8 @@ -import logging from typing import Any from uuid import uuid4 +import structlog + from app.db.repositories import ResourceAllocationRepository from app.domain.events import CreatePodCommandEvent, DeletePodCommandEvent, EventMetadata, ExecutionRequestedEvent from app.domain.saga import DomainResourceAllocationCreate @@ -9,7 +10,7 @@ from .saga_step import CompensationStep, SagaContext, SagaStep -logger = logging.getLogger(__name__) +logger = structlog.get_logger(__name__) class ValidateExecutionStep(SagaStep[ExecutionRequestedEvent]): diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index c7d57a80..b2e3fa0e 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -1,10 +1,10 @@ -import logging from datetime import UTC, datetime, timedelta from uuid import uuid4 +import structlog +from opentelemetry import trace from opentelemetry.trace import SpanKind -from app.core.tracing import EventAttributes, get_tracer from app.db.repositories import ResourceAllocationRepository, SagaRepository from app.domain.enums import SagaState from app.domain.events import ( @@ -35,7 +35,7 @@ def __init__( saga_repository: SagaRepository, producer: UnifiedProducer, resource_allocation_repository: ResourceAllocationRepository, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ): self.config = config self._producer = producer @@ -139,7 +139,7 @@ async def _execute_saga( trigger_event: DomainEvent, ) -> None: """Execute saga steps.""" - tracer = get_tracer() + tracer = trace.get_tracer(__name__) try: steps = saga.get_steps() @@ -153,10 +153,10 @@ async def _execute_saga( name="saga.step", kind=SpanKind.INTERNAL, attributes={ - str(EventAttributes.SAGA_NAME): instance.saga_name, - str(EventAttributes.SAGA_ID): instance.saga_id, - str(EventAttributes.SAGA_STEP): step.name, - str(EventAttributes.EXECUTION_ID): instance.execution_id, + "saga.name": instance.saga_name, + "saga.id": instance.saga_id, + "saga.step": step.name, + "execution.id": instance.execution_id, }, ): success = await step.execute(context, trigger_event) diff --git a/backend/app/services/saga/saga_service.py b/backend/app/services/saga/saga_service.py index dea0f534..e3bd5b7a 100644 --- 
a/backend/app/services/saga/saga_service.py +++ b/backend/app/services/saga/saga_service.py @@ -1,4 +1,4 @@ -import logging +import structlog from app.db.repositories import ExecutionRepository, SagaRepository from app.domain.enums import SagaState, UserRole @@ -23,7 +23,7 @@ def __init__( saga_repo: SagaRepository, execution_repo: ExecutionRepository, orchestrator: SagaOrchestrator, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ): self.saga_repo = saga_repo self.execution_repo = execution_repo diff --git a/backend/app/services/saved_script_service.py b/backend/app/services/saved_script_service.py index 64e5aef1..e24bfb5e 100644 --- a/backend/app/services/saved_script_service.py +++ b/backend/app/services/saved_script_service.py @@ -1,4 +1,4 @@ -import logging +import structlog from app.db.repositories import SavedScriptRepository from app.domain.saved_script import ( @@ -11,7 +11,7 @@ class SavedScriptService: - def __init__(self, saved_script_repo: SavedScriptRepository, logger: logging.Logger): + def __init__(self, saved_script_repo: SavedScriptRepository, logger: structlog.stdlib.BoundLogger): self.saved_script_repo = saved_script_repo self.logger = logger diff --git a/backend/app/services/sse/redis_bus.py b/backend/app/services/sse/redis_bus.py index 23408fdb..72dff3e2 100644 --- a/backend/app/services/sse/redis_bus.py +++ b/backend/app/services/sse/redis_bus.py @@ -1,9 +1,9 @@ from __future__ import annotations -import logging from typing import ClassVar, Type, TypeVar import redis.asyncio as redis +import structlog from pydantic import BaseModel from app.domain.enums import EventType @@ -16,7 +16,7 @@ class SSERedisSubscription: """Subscription wrapper for Redis pubsub with typed message parsing.""" - def __init__(self, pubsub: redis.client.PubSub, channel: str, logger: logging.Logger) -> None: + def __init__(self, pubsub: redis.client.PubSub, channel: str, logger: structlog.stdlib.BoundLogger) -> None: self._pubsub = pubsub self._channel = channel self.logger = logger @@ -67,7 +67,7 @@ class SSERedisBus: def __init__( self, redis_client: redis.Redis, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, exec_prefix: str = "sse:exec:", notif_prefix: str = "sse:notif:", ) -> None: diff --git a/backend/app/services/sse/sse_service.py b/backend/app/services/sse/sse_service.py index cbd3d487..7a7e4cd7 100644 --- a/backend/app/services/sse/sse_service.py +++ b/backend/app/services/sse/sse_service.py @@ -1,9 +1,10 @@ import asyncio -import logging from collections.abc import AsyncGenerator from datetime import datetime, timezone from typing import Any +import structlog + from app.core.metrics import ConnectionMetrics from app.db.repositories import SSERepository from app.domain.enums import EventType, NotificationChannel, SSEControlEvent @@ -40,7 +41,7 @@ def __init__( repository: SSERepository, sse_bus: SSERedisBus, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, connection_metrics: ConnectionMetrics, ) -> None: self.repository = repository diff --git a/backend/app/services/user_settings_service.py b/backend/app/services/user_settings_service.py index 0721c072..a2af9029 100644 --- a/backend/app/services/user_settings_service.py +++ b/backend/app/services/user_settings_service.py @@ -1,7 +1,7 @@ -import logging from datetime import datetime, timedelta, timezone from typing import Any +import structlog from cachetools import TTLCache from app.db.repositories import UserSettingsRepository @@ -25,7 +25,7 @@ def __init__( 
repository: UserSettingsRepository, event_service: KafkaEventService, settings: Settings, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: self.repository = repository self.event_service = event_service diff --git a/backend/app/settings.py b/backend/app/settings.py index d943bc56..8c12c163 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -111,11 +111,8 @@ def __init__( NOTIF_THROTTLE_MAX_PER_HOUR: int = 5 NOTIF_MAX_SCHEDULE_DAYS: int = 25 # Max days ahead a notification can be scheduled (must be < TTL) - # OpenTelemetry / Jaeger Configuration - ENABLE_TRACING: bool = True - JAEGER_AGENT_HOST: str = "jaeger" - JAEGER_AGENT_PORT: int = 6831 - JAEGER_COLLECTOR_ENDPOINT: str | None = None + # OpenTelemetry / Tracing Configuration + OTLP_TRACES_ENDPOINT: str = "" TRACING_SAMPLING_RATE: float = Field( default=0.1, # 10% sampling by default ge=0.0, @@ -124,8 +121,6 @@ def __init__( ) TRACING_SERVICE_NAME: str = "integr8scode-backend" TRACING_SERVICE_VERSION: str = "1.0.0" - TRACING_ADAPTIVE_SAMPLING: bool = False # Enable adaptive sampling in production - # Dead Letter Queue Configuration DLQ_RETRY_MAX_ATTEMPTS: int = 5 DLQ_RETRY_BASE_DELAY_SECONDS: float = 60.0 @@ -158,11 +153,8 @@ def __init__( SERVICE_VERSION: str = "1.0.0" ENVIRONMENT: str = "production" # deployment environment (production, staging, development) - # OpenTelemetry Configuration + # OpenTelemetry metrics export endpoint OTEL_EXPORTER_OTLP_ENDPOINT: str | None = None - OTEL_SERVICE_NAME: str | None = None - OTEL_SERVICE_VERSION: str | None = None - OTEL_RESOURCE_ATTRIBUTES: str | None = None # Web server (Gunicorn/Uvicorn) concurrency settings WEB_CONCURRENCY: int = 4 diff --git a/backend/config.test.toml b/backend/config.test.toml index a1bba07b..a4e0ec97 100644 --- a/backend/config.test.toml +++ b/backend/config.test.toml @@ -40,10 +40,8 @@ SSE_HEARTBEAT_INTERVAL = 30 LOG_LEVEL = "WARNING" -# Tracing -ENABLE_TRACING = true -JAEGER_AGENT_HOST = "jaeger" -JAEGER_AGENT_PORT = 6831 +# Tracing (no exporter in tests — spans created but not exported) +OTLP_TRACES_ENDPOINT = "" TRACING_SERVICE_NAME = "integr8scode-backend" TRACING_SERVICE_VERSION = "1.0.0" TRACING_SAMPLING_RATE = 1.0 @@ -61,12 +59,6 @@ APP_URL = "https://localhost" SERVICE_NAME = "integr8scode-backend" SERVICE_VERSION = "1.0.0" -# OpenTelemetry -OTEL_EXPORTER_OTLP_ENDPOINT = "http://otel-collector:4317" -OTEL_SERVICE_NAME = "integr8scode-backend" -OTEL_SERVICE_VERSION = "1.0.0" -OTEL_RESOURCE_ATTRIBUTES = "environment=test,team=backend" - # Gunicorn / Uvicorn WEB_CONCURRENCY = 1 WEB_THREADS = 4 diff --git a/backend/config.toml b/backend/config.toml index 7a757b5c..369d4ea5 100644 --- a/backend/config.toml +++ b/backend/config.toml @@ -42,9 +42,7 @@ SSE_HEARTBEAT_INTERVAL = 30 LOG_LEVEL = "DEBUG" # Tracing -ENABLE_TRACING = true -JAEGER_AGENT_HOST = "jaeger" -JAEGER_AGENT_PORT = 6831 +OTLP_TRACES_ENDPOINT = "http://jaeger:4317" TRACING_SERVICE_NAME = "integr8scode-backend" TRACING_SERVICE_VERSION = "1.0.0" TRACING_SAMPLING_RATE = 1.0 @@ -62,11 +60,8 @@ APP_URL = "https://localhost" SERVICE_NAME = "integr8scode-backend" SERVICE_VERSION = "1.0.0" -# OpenTelemetry +# OpenTelemetry metrics export OTEL_EXPORTER_OTLP_ENDPOINT = "http://otel-collector:4317" -OTEL_SERVICE_NAME = "integr8scode-backend" -OTEL_SERVICE_VERSION = "1.0.0" -OTEL_RESOURCE_ATTRIBUTES = "environment=production,team=backend" # Gunicorn / Uvicorn WEB_CONCURRENCY = 4 diff --git a/backend/otel-collector-config.yaml b/backend/otel-collector-config.yaml index 
3504b6c5..83002a47 100644 --- a/backend/otel-collector-config.yaml +++ b/backend/otel-collector-config.yaml @@ -28,6 +28,23 @@ processors: check_interval: 1s limit_mib: 512 spike_limit_mib: 128 + + tail_sampling: + decision_wait: 10s + num_traces: 50000 + policies: + - name: error-traces + type: status_code + status_code: + status_codes: [ERROR] + - name: slow-traces + type: latency + latency: + threshold_ms: 1000 + - name: base-rate + type: probabilistic + probabilistic: + sampling_percentage: 10 resource: attributes: @@ -82,7 +99,7 @@ service: pipelines: traces: receivers: [otlp] - processors: [memory_limiter, batch, resource, attributes] + processors: [memory_limiter, tail_sampling, batch, resource, attributes] exporters: [otlp/jaeger, logging] metrics: diff --git a/backend/pyproject.toml b/backend/pyproject.toml index f845fcc6..ba388c3d 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -122,6 +122,7 @@ dependencies = [ "aiofiles==25.1.0", "APScheduler==3.10.4", "faststream[kafka]==0.6.6", + "structlog>=25.5.0", ] [build-system] diff --git a/backend/tests/e2e/core/test_middlewares.py b/backend/tests/e2e/core/test_middlewares.py index 9c8f9a2c..d0edc147 100644 --- a/backend/tests/e2e/core/test_middlewares.py +++ b/backend/tests/e2e/core/test_middlewares.py @@ -16,7 +16,7 @@ async def test_generates_correlation_id(self, client: httpx.AsyncClient) -> None assert response.status_code == 200 assert "X-Correlation-ID" in response.headers correlation_id = response.headers["X-Correlation-ID"] - assert correlation_id.startswith("req_") + assert correlation_id.startswith("req-") @pytest.mark.asyncio async def test_passes_through_correlation_id( @@ -33,22 +33,6 @@ async def test_passes_through_correlation_id( assert response.status_code == 200 assert response.headers["X-Correlation-ID"] == custom_id - @pytest.mark.asyncio - async def test_accepts_request_id_header( - self, client: httpx.AsyncClient - ) -> None: - """Middleware accepts X-Request-ID as alternative header.""" - request_id = "request-id-67890" - - response = await client.get( - "/api/v1/health/live", - headers={"X-Request-ID": request_id}, - ) - - assert response.status_code == 200 - # Should use request ID as correlation ID - assert response.headers["X-Correlation-ID"] == request_id - class TestCSRFMiddleware: """Tests for CSRFMiddleware.""" diff --git a/backend/tests/e2e/db/repositories/test_dlq_repository.py b/backend/tests/e2e/db/repositories/test_dlq_repository.py index 474c5c44..4238b7c6 100644 --- a/backend/tests/e2e/db/repositories/test_dlq_repository.py +++ b/backend/tests/e2e/db/repositories/test_dlq_repository.py @@ -1,4 +1,4 @@ -import logging +import structlog from datetime import datetime, timezone import pytest @@ -9,7 +9,7 @@ pytestmark = pytest.mark.e2e -_test_logger = logging.getLogger("test.db.repositories.dlq_repository") +_test_logger = structlog.get_logger("test.db.repositories.dlq_repository") @pytest.fixture() diff --git a/backend/tests/e2e/db/repositories/test_execution_repository.py b/backend/tests/e2e/db/repositories/test_execution_repository.py index 60694fd1..083ebf82 100644 --- a/backend/tests/e2e/db/repositories/test_execution_repository.py +++ b/backend/tests/e2e/db/repositories/test_execution_repository.py @@ -1,4 +1,4 @@ -import logging +import structlog from uuid import uuid4 import pytest @@ -6,7 +6,7 @@ from app.domain.enums import ExecutionStatus from app.domain.execution import DomainExecutionCreate, DomainExecutionUpdate -_test_logger = 
logging.getLogger("test.db.repositories.execution_repository") +_test_logger = structlog.get_logger("test.db.repositories.execution_repository") pytestmark = pytest.mark.e2e diff --git a/backend/tests/e2e/dlq/test_dlq_manager.py b/backend/tests/e2e/dlq/test_dlq_manager.py index 90f96865..d0b7c3d4 100644 --- a/backend/tests/e2e/dlq/test_dlq_manager.py +++ b/backend/tests/e2e/dlq/test_dlq_manager.py @@ -1,6 +1,6 @@ import asyncio import json -import logging +import structlog import uuid from datetime import datetime, timezone @@ -24,7 +24,7 @@ # Serial execution ensures each test's manager processes only its own messages. pytestmark = [pytest.mark.e2e, pytest.mark.kafka, pytest.mark.mongodb, pytest.mark.xdist_group("dlq")] -_test_logger = logging.getLogger("test.dlq.manager") +_test_logger = structlog.get_logger("test.dlq.manager") @pytest.mark.asyncio diff --git a/backend/tests/e2e/idempotency/test_idempotency.py b/backend/tests/e2e/idempotency/test_idempotency.py index 88814471..f458475c 100644 --- a/backend/tests/e2e/idempotency/test_idempotency.py +++ b/backend/tests/e2e/idempotency/test_idempotency.py @@ -1,6 +1,6 @@ import asyncio import json -import logging +import structlog import uuid from datetime import datetime, timedelta, timezone @@ -17,7 +17,7 @@ pytestmark = [pytest.mark.e2e, pytest.mark.redis] # Test logger for all tests -_test_logger = logging.getLogger("test.idempotency") +_test_logger = structlog.get_logger("test.idempotency") class TestIdempotencyManager: diff --git a/backend/tests/e2e/result_processor/test_result_processor.py b/backend/tests/e2e/result_processor/test_result_processor.py index 2d149ef9..359ac1d7 100644 --- a/backend/tests/e2e/result_processor/test_result_processor.py +++ b/backend/tests/e2e/result_processor/test_result_processor.py @@ -1,4 +1,4 @@ -import logging +import structlog import pytest from app.core.metrics import ExecutionMetrics @@ -23,7 +23,7 @@ pytest.mark.xdist_group("kafka_consumers"), ] -_test_logger = logging.getLogger("test.result_processor.processor") +_test_logger = structlog.get_logger("test.result_processor.processor") @pytest.mark.asyncio diff --git a/backend/tests/e2e/services/sse/test_partitioned_event_router.py b/backend/tests/e2e/services/sse/test_partitioned_event_router.py index 11a18091..228c33fe 100644 --- a/backend/tests/e2e/services/sse/test_partitioned_event_router.py +++ b/backend/tests/e2e/services/sse/test_partitioned_event_router.py @@ -1,5 +1,5 @@ import asyncio -import logging +import structlog from uuid import uuid4 import pytest @@ -12,7 +12,7 @@ pytestmark = [pytest.mark.e2e, pytest.mark.redis] -_test_logger = logging.getLogger("test.services.sse.partitioned_event_router_integration") +_test_logger = structlog.get_logger("test.services.sse.partitioned_event_router_integration") @pytest.mark.asyncio diff --git a/backend/tests/e2e/services/sse/test_redis_bus.py b/backend/tests/e2e/services/sse/test_redis_bus.py index fffea4a0..d236fc2e 100644 --- a/backend/tests/e2e/services/sse/test_redis_bus.py +++ b/backend/tests/e2e/services/sse/test_redis_bus.py @@ -1,5 +1,5 @@ import asyncio -import logging +import structlog from datetime import datetime, timezone from typing import Any, cast @@ -12,7 +12,7 @@ pytestmark = pytest.mark.e2e -_test_logger = logging.getLogger("test.services.sse.redis_bus") +_test_logger = structlog.get_logger("test.services.sse.redis_bus") class _FakePubSub: diff --git a/backend/tests/e2e/test_k8s_worker_create_pod.py b/backend/tests/e2e/test_k8s_worker_create_pod.py index 6ec9984d..0f2dd99c 
100644 --- a/backend/tests/e2e/test_k8s_worker_create_pod.py +++ b/backend/tests/e2e/test_k8s_worker_create_pod.py @@ -1,4 +1,4 @@ -import logging +import structlog import uuid import pytest @@ -14,7 +14,7 @@ pytestmark = [pytest.mark.e2e, pytest.mark.k8s] -_test_logger = logging.getLogger("test.k8s.worker_create_pod") +_test_logger = structlog.get_logger("test.k8s.worker_create_pod") @pytest.mark.asyncio diff --git a/backend/tests/unit/core/test_adaptive_sampling.py b/backend/tests/unit/core/test_adaptive_sampling.py deleted file mode 100644 index 9dde2fab..00000000 --- a/backend/tests/unit/core/test_adaptive_sampling.py +++ /dev/null @@ -1,62 +0,0 @@ -import time -from unittest.mock import MagicMock - -import pytest -from app.core.adaptive_sampling import AdaptiveSampler, create_adaptive_sampler -from app.settings import Settings - - -def test_is_error_variants() -> None: - s = AdaptiveSampler(base_rate=0.5, adjustment_interval=1) - assert s._is_error({"error": True}) is True - assert s._is_error({"http.status_code": 500}) is True - assert s._is_error({"http.status_code": "503"}) is True - assert s._is_error({"exception.type": "ValueError"}) is True - assert s._is_error({"http.status_code": 200}) is False - - -def test_should_sample_respects_rate() -> None: - s = AdaptiveSampler(base_rate=1.0, adjustment_interval=1) - # With current_rate=1.0, all trace_ids sample - res = s.should_sample(None, trace_id=123, name="op") - assert res.decision.value == 2 # RECORD_AND_SAMPLE - # With rate ~0, most should drop; we choose large id to exceed threshold - s._current_rate = 0.0 - res2 = s.should_sample(None, trace_id=(1 << 64) - 1, name="op") - assert res2.decision.value in (0, 1) # DROP or RECORD_ONLY depending impl - - -def test_adjust_sampling_rate_error_and_traffic() -> None: - s = AdaptiveSampler(base_rate=0.1, adjustment_interval=1) - now = time.time() - # Simulate 100 requests in window with 10 errors (> threshold 5%) - s._request_window.clear() - s._error_window.clear() - for _ in range(100): - s._request_window.append(now) - for _ in range(10): - s._error_window.append(now) - old = s._current_rate - s._adjust_sampling_rate() - assert s._current_rate >= old - # Simulate high traffic and low errors -> decrease toward min_rate - s._request_window.clear() - s._error_window.clear() - for _ in range(2000): - s._request_window.append(now) - old2 = s._current_rate - s._adjust_sampling_rate() - assert s._current_rate <= old2 - - -def test_get_description_and_factory(monkeypatch: pytest.MonkeyPatch) -> None: - s = AdaptiveSampler(base_rate=0.2, adjustment_interval=1) - desc = s.get_description() - assert "AdaptiveSampler(" in desc - - mock_settings = MagicMock(spec=Settings) - mock_settings.TRACING_SAMPLING_RATE = 0.2 - - monkeypatch.setenv("TRACING_SAMPLING_RATE", "0.2") - sampler = create_adaptive_sampler(mock_settings) - assert sampler._current_rate == 0.2 diff --git a/backend/tests/unit/core/test_logging_and_correlation.py b/backend/tests/unit/core/test_logging_and_correlation.py index e54cc0be..934e4482 100644 --- a/backend/tests/unit/core/test_logging_and_correlation.py +++ b/backend/tests/unit/core/test_logging_and_correlation.py @@ -1,15 +1,13 @@ -import io -import json -import logging from typing import Any +from unittest.mock import MagicMock, patch import pytest -from app.core.correlation import CorrelationContext, CorrelationMiddleware +import structlog +from app.core.correlation import CorrelationMiddleware from app.core.logging import ( - CorrelationFilter, - JSONFormatter, - 
correlation_id_context, - request_metadata_context, + SENSITIVE_PATTERNS, + add_otel_context, + sanitize_sensitive_data, setup_logger, ) from starlette.applications import Starlette @@ -19,112 +17,15 @@ from starlette.testclient import TestClient -def capture_log( - formatter: logging.Formatter, - msg: str, - extra: dict[str, Any] | None = None, -) -> dict[str, Any]: - """Capture log output as parsed JSON.""" - logger = logging.getLogger("test_capture") - - string_io = io.StringIO() - stream = logging.StreamHandler(string_io) - stream.setFormatter(formatter) - - correlation_filter = CorrelationFilter() - stream.addFilter(correlation_filter) - - logger.handlers = [stream] - logger.setLevel(logging.INFO) - logger.propagate = False - - logger.info(msg, extra=extra or {}) - stream.flush() - - output = string_io.getvalue() - string_io.close() - - if output: - result: dict[str, Any] = json.loads(output) - return result - - # Fallback: create and format record manually - lr = logging.LogRecord("test", logging.INFO, __file__, 1, msg, (), None, None) - correlation_filter.filter(lr) - s = formatter.format(lr) - fallback_result: dict[str, Any] = json.loads(s) - return fallback_result - - -class TestJSONFormatter: - """Tests for JSON log formatter.""" - - def test_formats_as_valid_json(self) -> None: - """Formatter outputs valid JSON.""" - formatter = JSONFormatter() - record = logging.LogRecord( - name="test", - level=logging.INFO, - pathname="test.py", - lineno=1, - msg="Test message", - args=(), - exc_info=None, - ) - - output = formatter.format(record) - parsed = json.loads(output) - - assert parsed["message"] == "Test message" - assert parsed["level"] == "INFO" - assert parsed["logger"] == "test" - assert "timestamp" in parsed - - def test_includes_correlation_id_from_record(self) -> None: - """Formatter includes correlation_id when present on record.""" - formatter = JSONFormatter() - record = logging.LogRecord( - name="test", - level=logging.INFO, - pathname="test.py", - lineno=1, - msg="Test", - args=(), - exc_info=None, - ) - record.correlation_id = "req_12345" - - output = formatter.format(record) - parsed = json.loads(output) - - assert parsed["correlation_id"] == "req_12345" - - def test_includes_request_metadata_from_record(self) -> None: - """Formatter includes request metadata when present on record.""" - formatter = JSONFormatter() - record = logging.LogRecord( - name="test", - level=logging.INFO, - pathname="test.py", - lineno=1, - msg="Test", - args=(), - exc_info=None, - ) - record.request_method = "POST" - record.request_path = "/api/v1/execute" - record.client_host = "192.168.1.1" - - output = formatter.format(record) - parsed = json.loads(output) - - assert parsed["request_method"] == "POST" - assert parsed["request_path"] == "/api/v1/execute" - assert parsed["client_host"] == "192.168.1.1" - - -class TestSensitiveDataSanitization: - """Tests for sensitive data sanitization in logs.""" +class TestSanitizeSensitiveData: + """Tests for the sanitize_sensitive_data structlog processor.""" + + @staticmethod + def _run_processor(event: str) -> str: + """Run sanitize_sensitive_data processor and return the sanitized event string.""" + event_dict: dict[str, Any] = {"event": event} + result = sanitize_sensitive_data(None, "info", event_dict) + return str(result["event"]) @pytest.mark.parametrize( ("input_data", "forbidden_text", "expected_marker"), @@ -164,125 +65,55 @@ class TestSensitiveDataSanitization: def test_sanitizes_sensitive_data( self, input_data: str, forbidden_text: str, 
expected_marker: str ) -> None: - """Sensitive data is redacted from logs.""" - formatter = JSONFormatter() - - result = formatter._sanitize_sensitive_data(input_data) - + result = self._run_processor(input_data) assert forbidden_text not in result assert expected_marker in result def test_sanitizes_multiple_types_in_one_message(self) -> None: - """Multiple sensitive data types are sanitized in a single message.""" - formatter = JSONFormatter() msg = "Bearer abcd1234 and mongodb://user:secret@host/db and email a@b.com" + result = self._run_processor(msg) + assert "BEARER_TOKEN_REDACTED" in result + assert "MONGODB_REDACTED" in result + assert "EMAIL_REDACTED" in result + + def test_non_string_event_unchanged(self) -> None: + event_dict: dict[str, Any] = {"event": 42} + result = sanitize_sensitive_data(None, "info", event_dict) + assert result["event"] == 42 + + def test_has_expected_pattern_count(self) -> None: + assert len(SENSITIVE_PATTERNS) == 6 + + +class TestAddOtelContext: + """Tests for the add_otel_context structlog processor.""" + + def test_no_span_no_ids(self) -> None: + event_dict: dict[str, Any] = {"event": "test"} + result = add_otel_context(None, "info", event_dict) + assert "trace_id" not in result + assert "span_id" not in result + + def test_with_valid_span(self) -> None: + mock_span = MagicMock() + mock_span.is_recording.return_value = True + mock_ctx = MagicMock() + mock_ctx.is_valid = True + mock_ctx.trace_id = 0x1234567890ABCDEF1234567890ABCDEF + mock_ctx.span_id = 0x1234567890ABCDEF + mock_span.get_span_context.return_value = mock_ctx - result = capture_log(formatter, msg) - sanitized = result["message"] - - assert "BEARER_TOKEN_REDACTED" in sanitized - assert "MONGODB_REDACTED" in sanitized - assert "EMAIL_REDACTED" in sanitized - - -class TestCorrelationFilter: - """Tests for correlation filter.""" - - def test_adds_correlation_id_from_context(self) -> None: - """Filter adds correlation_id from context to record.""" - filter_ = CorrelationFilter() - - token = correlation_id_context.set("test-correlation-123") - try: - record = logging.LogRecord( - name="test", - level=logging.INFO, - pathname="test.py", - lineno=1, - msg="Test", - args=(), - exc_info=None, - ) - - result = filter_.filter(record) - - assert result is True - assert record.correlation_id == "test-correlation-123" # type: ignore[attr-defined] - finally: - correlation_id_context.reset(token) - - def test_adds_request_metadata_from_context(self) -> None: - """Filter adds request metadata from context to record.""" - filter_ = CorrelationFilter() - - metadata = { - "method": "GET", - "path": "/api/v1/test", - "client": {"host": "127.0.0.1"}, - } - token = request_metadata_context.set(metadata) - try: - record = logging.LogRecord( - name="test", - level=logging.INFO, - pathname="test.py", - lineno=1, - msg="Test", - args=(), - exc_info=None, - ) - - result = filter_.filter(record) - - assert result is True - assert record.request_method == "GET" # type: ignore[attr-defined] - assert record.request_path == "/api/v1/test" # type: ignore[attr-defined] - assert record.client_host == "127.0.0.1" # type: ignore[attr-defined] - finally: - request_metadata_context.reset(token) - - def test_always_returns_true(self) -> None: - """Filter always returns True (never drops records).""" - filter_ = CorrelationFilter() - record = logging.LogRecord( - name="test", - level=logging.INFO, - pathname="test.py", - lineno=1, - msg="Test", - args=(), - exc_info=None, - ) - - assert filter_.filter(record) is True - - -class 
TestCorrelationContext: - """Tests for CorrelationContext usage.""" - - def test_context_and_filter_integration(self) -> None: - """CorrelationContext integrates with CorrelationFilter.""" - CorrelationContext.set_correlation_id("cid-1") - CorrelationContext.set_request_metadata( - {"method": "GET", "path": "/x", "client": {"host": "1.2.3.4"}} - ) - - result = capture_log(JSONFormatter(), "hello") - - assert result["correlation_id"] == "cid-1" - assert result["request_method"] == "GET" - assert result["request_path"] == "/x" - assert result["client_host"] == "1.2.3.4" - - CorrelationContext.clear() + with patch("app.core.logging.trace.get_current_span", return_value=mock_span): + event_dict: dict[str, Any] = {"event": "test"} + result = add_otel_context(None, "info", event_dict) + assert result["trace_id"] == "1234567890abcdef1234567890abcdef" + assert result["span_id"] == "1234567890abcdef" class TestCorrelationMiddleware: """Tests for CorrelationMiddleware.""" def test_sets_correlation_header(self) -> None: - """Middleware sets X-Correlation-ID response header.""" - async def ping(request: Request) -> JSONResponse: return JSONResponse({"ok": True}) @@ -295,56 +126,58 @@ async def ping(request: Request) -> JSONResponse: assert response.status_code == 200 assert "X-Correlation-ID" in response.headers + def test_preserves_provided_correlation_id(self) -> None: + async def ping(request: Request) -> JSONResponse: + return JSONResponse({"ok": True}) -class TestSetupLogger: - """Tests for logger setup.""" + app = Starlette(routes=[Route("/ping", ping)]) + app.add_middleware(CorrelationMiddleware) - def test_creates_named_logger(self) -> None: - """setup_logger creates logger with correct name.""" - logger = setup_logger("INFO") + with TestClient(app) as client: + response = client.get("/ping", headers={"X-Correlation-ID": "custom-id-123"}) + assert response.headers["X-Correlation-ID"] == "custom-id-123" - assert logger.name == "integr8scode" + def test_stores_correlation_in_scope_state(self) -> None: + captured_state: dict[str, Any] = {} - def test_sets_correct_level(self) -> None: - """Logger is set to correct level.""" - logger = setup_logger("WARNING") + async def capture(request: Request) -> JSONResponse: + captured_state["correlation_id"] = request.state.correlation_id + return JSONResponse({"ok": True}) - assert logger.level == logging.WARNING + app = Starlette(routes=[Route("/capture", capture)]) + app.add_middleware(CorrelationMiddleware) - def test_handles_case_insensitive_level(self) -> None: - """Logger handles case-insensitive level strings.""" - logger = setup_logger("debug") + with TestClient(app) as client: + client.get("/capture") - assert logger.level == logging.DEBUG + assert "correlation_id" in captured_state + assert captured_state["correlation_id"].startswith("req-") - def test_has_json_formatter(self) -> None: - """Logger has JSON formatter attached.""" - logger = setup_logger("INFO") - assert len(logger.handlers) > 0 - handler = logger.handlers[0] - assert isinstance(handler.formatter, JSONFormatter) +class TestSetupLogger: + """Tests for logger setup.""" - def test_has_correlation_filter(self) -> None: - """Logger has correlation filter attached.""" + def test_returns_bound_logger(self) -> None: logger = setup_logger("INFO") + assert hasattr(logger, "info") + assert hasattr(logger, "bind") + assert "BoundLogger" in type(logger).__name__ or "LazyProxy" in type(logger).__name__ - assert len(logger.handlers) > 0 - handler = logger.handlers[0] - filter_types = 
[type(f).__name__ for f in handler.filters] - assert "CorrelationFilter" in filter_types - - def test_clears_existing_handlers(self) -> None: - """setup_logger clears existing handlers.""" - logger1 = setup_logger("INFO") - initial_handlers = len(logger1.handlers) - - logger2 = setup_logger("DEBUG") - - assert len(logger2.handlers) == initial_handlers - - def test_returns_logger(self) -> None: - """setup_logger returns a logger instance.""" - lg = setup_logger(log_level="INFO") + def test_has_info_method(self) -> None: + logger = setup_logger("INFO") + assert hasattr(logger, "info") + assert hasattr(logger, "warning") + assert hasattr(logger, "error") - assert hasattr(lg, "info") + def test_handles_case_insensitive_level(self) -> None: + logger = setup_logger("debug") + assert hasattr(logger, "debug") + + def test_logger_captures_event_and_keys(self) -> None: + structlog.reset_defaults() + setup_logger("INFO") + with structlog.testing.capture_logs() as cap_logs: + structlog.get_logger("integr8scode").info("test message", key="value") + assert len(cap_logs) >= 1 + assert cap_logs[0]["event"] == "test message" + assert cap_logs[0]["key"] == "value" diff --git a/backend/tests/unit/services/coordinator/test_coordinator_queue.py b/backend/tests/unit/services/coordinator/test_coordinator_queue.py index f7c4782b..5fdf15f4 100644 --- a/backend/tests/unit/services/coordinator/test_coordinator_queue.py +++ b/backend/tests/unit/services/coordinator/test_coordinator_queue.py @@ -1,4 +1,4 @@ -import logging +import structlog from unittest.mock import AsyncMock import pytest @@ -9,7 +9,7 @@ from tests.conftest import make_execution_requested_event -_test_logger = logging.getLogger("test.services.coordinator") +_test_logger = structlog.get_logger("test.services.coordinator") pytestmark = pytest.mark.unit diff --git a/backend/tests/unit/services/idempotency/test_idempotency_manager.py b/backend/tests/unit/services/idempotency/test_idempotency_manager.py index e00ef993..c4fa9ebe 100644 --- a/backend/tests/unit/services/idempotency/test_idempotency_manager.py +++ b/backend/tests/unit/services/idempotency/test_idempotency_manager.py @@ -1,4 +1,4 @@ -import logging +import structlog from unittest.mock import MagicMock import pytest @@ -10,7 +10,7 @@ pytestmark = pytest.mark.unit # Test logger -_test_logger = logging.getLogger("test.idempotency_manager") +_test_logger = structlog.get_logger("test.idempotency_manager") class TestIdempotencyConfig: diff --git a/backend/tests/unit/services/pod_monitor/test_event_mapper.py b/backend/tests/unit/services/pod_monitor/test_event_mapper.py index fbb57573..8f956d59 100644 --- a/backend/tests/unit/services/pod_monitor/test_event_mapper.py +++ b/backend/tests/unit/services/pod_monitor/test_event_mapper.py @@ -1,5 +1,5 @@ import json -import logging +import structlog from unittest.mock import AsyncMock, MagicMock import pytest @@ -18,7 +18,7 @@ pytestmark = pytest.mark.unit -_test_logger = logging.getLogger("test.services.pod_monitor.event_mapper") +_test_logger = structlog.get_logger("test.services.pod_monitor.event_mapper") def _ctx(pod: V1Pod, event_type: WatchEventType = WatchEventType.ADDED) -> PodContext: diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py index 7ca6ddab..28d9ecdc 100644 --- a/backend/tests/unit/services/pod_monitor/test_monitor.py +++ b/backend/tests/unit/services/pod_monitor/test_monitor.py @@ -1,4 +1,4 @@ -import logging +import structlog import types from typing import Any 
from unittest.mock import AsyncMock, MagicMock @@ -30,7 +30,7 @@ pytestmark = pytest.mark.unit -_test_logger = logging.getLogger("test.pod_monitor") +_test_logger = structlog.get_logger("test.pod_monitor") # ===== Test doubles for KafkaEventService dependencies ===== diff --git a/backend/tests/unit/services/result_processor/test_processor.py b/backend/tests/unit/services/result_processor/test_processor.py index 3ede4891..56b86ce1 100644 --- a/backend/tests/unit/services/result_processor/test_processor.py +++ b/backend/tests/unit/services/result_processor/test_processor.py @@ -1,4 +1,4 @@ -import logging +import structlog from unittest.mock import AsyncMock, MagicMock import pytest @@ -17,7 +17,7 @@ pytestmark = pytest.mark.unit -_test_logger = logging.getLogger("test.services.result_processor.processor") +_test_logger = structlog.get_logger("test.services.result_processor.processor") _METADATA = EventMetadata(service_name="tests", service_version="1.0.0") diff --git a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py index 2cb21a8b..53b8ae9e 100644 --- a/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py +++ b/backend/tests/unit/services/saga/test_saga_orchestrator_unit.py @@ -1,4 +1,4 @@ -import logging +import structlog import pytest from app.db.repositories import ResourceAllocationRepository, SagaRepository @@ -12,7 +12,7 @@ pytestmark = pytest.mark.unit -_test_logger = logging.getLogger("test.services.saga.orchestrator") +_test_logger = structlog.get_logger("test.services.saga.orchestrator") class _FakeRepo(SagaRepository): diff --git a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py index ce0b281e..150f5a1a 100644 --- a/backend/tests/unit/services/sse/test_kafka_redis_bridge.py +++ b/backend/tests/unit/services/sse/test_kafka_redis_bridge.py @@ -1,4 +1,4 @@ -import logging +import structlog import pytest from app.domain.events import DomainEvent, EventMetadata, ExecutionStartedEvent @@ -6,7 +6,7 @@ pytestmark = pytest.mark.unit -_test_logger = logging.getLogger("test.services.sse.redis_bus_routing") +_test_logger = structlog.get_logger("test.services.sse.redis_bus_routing") class _FakeBus(SSERedisBus): diff --git a/backend/tests/unit/services/sse/test_sse_service.py b/backend/tests/unit/services/sse/test_sse_service.py index a43a82ea..a05fc762 100644 --- a/backend/tests/unit/services/sse/test_sse_service.py +++ b/backend/tests/unit/services/sse/test_sse_service.py @@ -1,6 +1,6 @@ import asyncio import json -import logging +import structlog from datetime import datetime, timezone from typing import Any from unittest.mock import MagicMock @@ -18,7 +18,7 @@ pytestmark = pytest.mark.unit -_test_logger = logging.getLogger("test.services.sse.sse_service") +_test_logger = structlog.get_logger("test.services.sse.sse_service") class _FakeSubscription(SSERedisSubscription): diff --git a/backend/tests/unit/services/test_admin_settings_service.py b/backend/tests/unit/services/test_admin_settings_service.py index bc553371..c5a961d8 100644 --- a/backend/tests/unit/services/test_admin_settings_service.py +++ b/backend/tests/unit/services/test_admin_settings_service.py @@ -1,4 +1,4 @@ -import logging +import structlog from unittest.mock import AsyncMock, MagicMock import pytest @@ -7,7 +7,7 @@ pytestmark = pytest.mark.unit -_logger = logging.getLogger("test.services.admin_settings") +_logger = 
structlog.get_logger("test.services.admin_settings") def _make_service( diff --git a/backend/tests/unit/services/test_login_lockout.py b/backend/tests/unit/services/test_login_lockout.py index e79641cd..fc1f9d6d 100644 --- a/backend/tests/unit/services/test_login_lockout.py +++ b/backend/tests/unit/services/test_login_lockout.py @@ -1,4 +1,4 @@ -import logging +import structlog from unittest.mock import AsyncMock import pytest @@ -7,7 +7,7 @@ pytestmark = pytest.mark.unit -_logger = logging.getLogger("test.services.login_lockout") +_logger = structlog.get_logger("test.services.login_lockout") def _make_service( diff --git a/backend/tests/unit/services/test_runtime_settings.py b/backend/tests/unit/services/test_runtime_settings.py index 06938558..62b3bcb6 100644 --- a/backend/tests/unit/services/test_runtime_settings.py +++ b/backend/tests/unit/services/test_runtime_settings.py @@ -1,4 +1,4 @@ -import logging +import structlog from time import monotonic from unittest.mock import AsyncMock @@ -9,7 +9,7 @@ pytestmark = pytest.mark.unit -_logger = logging.getLogger("test.services.runtime_settings") +_logger = structlog.get_logger("test.services.runtime_settings") def _make_settings() -> Settings: diff --git a/backend/uv.lock b/backend/uv.lock index bde0be0b..3c78102d 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1145,6 +1145,7 @@ dependencies = [ { name = "sortedcontainers" }, { name = "sse-starlette" }, { name = "starlette" }, + { name = "structlog" }, { name = "tiktoken" }, { name = "tomli" }, { name = "typing-extensions" }, @@ -1287,6 +1288,7 @@ requires-dist = [ { name = "sortedcontainers", specifier = "==2.4.0" }, { name = "sse-starlette", specifier = "==3.2.0" }, { name = "starlette", specifier = "==0.49.1" }, + { name = "structlog", specifier = ">=25.5.0" }, { name = "tiktoken", specifier = "==0.11.0" }, { name = "tomli", specifier = "==2.0.2" }, { name = "typing-extensions", specifier = "==4.12.2" }, @@ -2924,6 +2926,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/da/545b75d420bb23b5d494b0517757b351963e974e79933f01e05c929f20a6/starlette-0.49.1-py3-none-any.whl", hash = "sha256:d92ce9f07e4a3caa3ac13a79523bd18e3bc0042bb8ff2d759a8e7dd0e1859875", size = 74175, upload-time = "2025-10-28T17:34:09.13Z" }, ] +[[package]] +name = "structlog" +version = "25.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/52/9ba0f43b686e7f3ddfeaa78ac3af750292662284b3661e91ad5494f21dbc/structlog-25.5.0.tar.gz", hash = "sha256:098522a3bebed9153d4570c6d0288abf80a031dfdb2048d59a49e9dc2190fc98", size = 1460830, upload-time = "2025-10-27T08:28:23.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" }, +] + [[package]] name = "tiktoken" version = "0.11.0" diff --git a/backend/workers/run_coordinator.py b/backend/workers/run_coordinator.py index e1dc2d1e..32bd60db 100644 --- a/backend/workers/run_coordinator.py +++ b/backend/workers/run_coordinator.py @@ -3,9 +3,7 @@ from app.core.container import create_coordinator_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS -from app.domain.enums import GroupId from app.events.handlers import register_coordinator_subscriber from app.settings import 
Settings from beanie import init_beanie @@ -23,17 +21,6 @@ def main() -> None: logger.info("Starting ExecutionCoordinator worker...") - if settings.ENABLE_TRACING: - init_tracing( - service_name=GroupId.EXECUTION_COORDINATOR, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for ExecutionCoordinator") - async def run() -> None: # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes) client: AsyncMongoClient[dict[str, Any]] = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True) diff --git a/backend/workers/run_dlq_processor.py b/backend/workers/run_dlq_processor.py index 8ae604ed..609b884a 100644 --- a/backend/workers/run_dlq_processor.py +++ b/backend/workers/run_dlq_processor.py @@ -3,10 +3,8 @@ from app.core.container import create_dlq_processor_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS from app.dlq.manager import DLQManager -from app.domain.enums import GroupId from app.events.handlers import register_dlq_subscriber from app.settings import Settings from beanie import init_beanie @@ -24,17 +22,6 @@ def main() -> None: logger.info("Starting DLQ Processor worker...") - if settings.ENABLE_TRACING: - init_tracing( - service_name=GroupId.DLQ_MANAGER, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for DLQ Processor") - async def run() -> None: # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes) client: AsyncMongoClient[dict[str, Any]] = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True) diff --git a/backend/workers/run_event_replay.py b/backend/workers/run_event_replay.py index 43c53c0e..4c474a0f 100644 --- a/backend/workers/run_event_replay.py +++ b/backend/workers/run_event_replay.py @@ -4,7 +4,6 @@ from app.core.container import create_event_replay_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS from app.services.event_replay import EventReplayService from app.settings import Settings @@ -61,17 +60,6 @@ def main() -> None: logger.info("Starting Event Replay Service...") - if settings.ENABLE_TRACING: - init_tracing( - service_name="event-replay", - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for Event Replay Service") - asyncio.run(run_replay_service(settings)) diff --git a/backend/workers/run_k8s_worker.py b/backend/workers/run_k8s_worker.py index 5ddec7f8..4e5df1c9 100644 --- a/backend/workers/run_k8s_worker.py +++ b/backend/workers/run_k8s_worker.py @@ -3,9 +3,7 @@ from app.core.container import create_k8s_worker_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS -from app.domain.enums import GroupId from app.events.handlers import register_k8s_worker_subscriber from app.services.k8s_worker import KubernetesWorker from app.settings import Settings @@ -24,17 +22,6 @@ def main() -> None: logger.info("Starting KubernetesWorker...") - if settings.ENABLE_TRACING: - init_tracing( - 
service_name=GroupId.K8S_WORKER, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for KubernetesWorker") - async def run() -> None: # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes) client: AsyncMongoClient[dict[str, Any]] = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True) diff --git a/backend/workers/run_pod_monitor.py b/backend/workers/run_pod_monitor.py index e038d4d5..7ba1235b 100644 --- a/backend/workers/run_pod_monitor.py +++ b/backend/workers/run_pod_monitor.py @@ -3,9 +3,7 @@ from app.core.container import create_pod_monitor_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS -from app.domain.enums import GroupId from app.services.pod_monitor import PodMonitor from app.settings import Settings from beanie import init_beanie @@ -23,17 +21,6 @@ def main() -> None: logger.info("Starting PodMonitor worker...") - if settings.ENABLE_TRACING: - init_tracing( - service_name=GroupId.POD_MONITOR, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for PodMonitor Service") - async def run() -> None: # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes) client: AsyncMongoClient[dict[str, Any]] = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True) diff --git a/backend/workers/run_result_processor.py b/backend/workers/run_result_processor.py index e96b04e6..0d508f99 100644 --- a/backend/workers/run_result_processor.py +++ b/backend/workers/run_result_processor.py @@ -3,9 +3,7 @@ from app.core.container import create_result_processor_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS -from app.domain.enums import GroupId from app.events.handlers import register_result_processor_subscriber from app.settings import Settings from beanie import init_beanie @@ -23,17 +21,6 @@ def main() -> None: logger.info("Starting ResultProcessor worker...") - if settings.ENABLE_TRACING: - init_tracing( - service_name=GroupId.RESULT_PROCESSOR, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for ResultProcessor Service") - async def run() -> None: # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes) client: AsyncMongoClient[dict[str, Any]] = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True) diff --git a/backend/workers/run_saga_orchestrator.py b/backend/workers/run_saga_orchestrator.py index e07b0334..d2f999f8 100644 --- a/backend/workers/run_saga_orchestrator.py +++ b/backend/workers/run_saga_orchestrator.py @@ -3,9 +3,7 @@ from app.core.container import create_saga_orchestrator_container from app.core.logging import setup_logger -from app.core.tracing import init_tracing from app.db.docs import ALL_DOCUMENTS -from app.domain.enums import GroupId from app.events.handlers import register_saga_subscriber from app.services.saga import SagaOrchestrator from app.settings import Settings @@ -24,17 +22,6 @@ def main() -> None: logger.info("Starting Saga Orchestrator worker...") - if 
settings.ENABLE_TRACING: - init_tracing( - service_name=GroupId.SAGA_ORCHESTRATOR, - settings=settings, - logger=logger, - service_version=settings.TRACING_SERVICE_VERSION, - enable_console_exporter=False, - sampling_rate=settings.TRACING_SAMPLING_RATE, - ) - logger.info("Tracing initialized for Saga Orchestrator Service") - async def run() -> None: # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes) client: AsyncMongoClient[dict[str, Any]] = AsyncMongoClient(settings.MONGODB_URL, tz_aware=True) From cd329577ce67d7ea637bcab33be37060f8fcb268 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 00:43:01 +0100 Subject: [PATCH 02/12] fix: removed request scoped provider --- backend/app/core/container.py | 2 -- backend/app/core/providers.py | 15 --------------- 2 files changed, 17 deletions(-) diff --git a/backend/app/core/container.py b/backend/app/core/container.py index 3f3838d0..43b737f1 100644 --- a/backend/app/core/container.py +++ b/backend/app/core/container.py @@ -21,7 +21,6 @@ PodMonitorProvider, RedisProvider, RepositoryProvider, - RequestScopedProvider, ResourceCleanerProvider, ResultProcessorProvider, SagaOrchestratorProvider, @@ -64,7 +63,6 @@ def create_app_container(settings: Settings) -> AsyncContainer: CoordinatorProvider(), KubernetesProvider(), ResourceCleanerProvider(), - RequestScopedProvider(), FastapiProvider(), context={Settings: settings}, ) diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 8c982f9e..f414636d 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -122,21 +122,6 @@ def get_logger(self, settings: Settings) -> structlog.stdlib.BoundLogger: return setup_logger(settings.LOG_LEVEL) -class RequestScopedProvider(Provider): - """Provides REQUEST-scoped logger with correlation context bound from the request.""" - - scope = Scope.REQUEST - - @provide - def get_request_logger( - self, base: structlog.stdlib.BoundLogger, request: Request, - ) -> structlog.stdlib.BoundLogger: - return base.bind( - correlation_id=request.state.correlation_id, - request_path=str(request.url.path), - request_method=request.method, - ) - class RedisProvider(Provider): scope = Scope.APP From 8d2716edea8ec713680e930ab29998c2da851171 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 00:54:43 +0100 Subject: [PATCH 03/12] fix: logging -> structlog --- backend/app/api/routes/auth.py | 10 +++++----- backend/app/events/handlers.py | 24 ++++++++++++------------ backend/tests/e2e/app/test_main_app.py | 7 ++++--- backend/tests/e2e/core/test_container.py | 10 ++++------ 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/backend/app/api/routes/auth.py b/backend/app/api/routes/auth.py index df209913..3e1aff7f 100644 --- a/backend/app/api/routes/auth.py +++ b/backend/app/api/routes/auth.py @@ -1,6 +1,6 @@ -import logging from datetime import timedelta +import structlog from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute from fastapi import APIRouter, Depends, HTTPException, Request, Response @@ -40,7 +40,7 @@ async def login( security_service: FromDishka[SecurityService], runtime_settings: FromDishka[RuntimeSettingsLoader], lockout_service: FromDishka[LoginLockoutService], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], form_data: OAuth2PasswordRequestForm = Depends(), ) -> LoginResponse: """Authenticate and receive session cookies.""" @@ -169,7 +169,7 @@ async def register( user_repo: 
FromDishka[UserRepository], security_service: FromDishka[SecurityService], runtime_settings: FromDishka[RuntimeSettingsLoader], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> UserResponse: """Register a new user account.""" logger.info( @@ -227,7 +227,7 @@ async def get_current_user_profile( request: Request, response: Response, auth_service: FromDishka[AuthService], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> UserResponse: """Get the authenticated user's profile.""" current_user = await auth_service.get_current_user(request) @@ -252,7 +252,7 @@ async def get_current_user_profile( async def logout( request: Request, response: Response, - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> MessageResponse: """Log out and clear session cookies.""" logger.info( diff --git a/backend/app/events/handlers.py b/backend/app/events/handlers.py index c8bfc4e7..b65cc51e 100644 --- a/backend/app/events/handlers.py +++ b/backend/app/events/handlers.py @@ -1,8 +1,8 @@ import asyncio -import logging from datetime import datetime, timezone from typing import Any +import structlog from dishka.integrations.faststream import FromDishka from faststream import AckPolicy from faststream.kafka import KafkaBroker, KafkaMessage @@ -39,7 +39,7 @@ async def with_idempotency( idem: IdempotencyManager, key_strategy: KeyStrategy, ttl_seconds: int, - logger: logging.Logger, + logger: structlog.stdlib.BoundLogger, ) -> None: """Run *handler* inside an idempotency guard (check -> execute -> mark).""" result = await idem.check_and_reserve( @@ -82,7 +82,7 @@ async def on_execution_requested( body: ExecutionRequestedEvent, coordinator: FromDishka[ExecutionCoordinator], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, coordinator.handle_execution_requested, idem, KeyStrategy.EVENT_BASED, 7200, logger, @@ -93,7 +93,7 @@ async def on_execution_completed( body: ExecutionCompletedEvent, coordinator: FromDishka[ExecutionCoordinator], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, coordinator.handle_execution_completed, idem, KeyStrategy.EVENT_BASED, 7200, logger, @@ -104,7 +104,7 @@ async def on_execution_failed( body: ExecutionFailedEvent, coordinator: FromDishka[ExecutionCoordinator], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, coordinator.handle_execution_failed, idem, KeyStrategy.EVENT_BASED, 7200, logger, @@ -115,7 +115,7 @@ async def on_execution_cancelled( body: ExecutionCancelledEvent, coordinator: FromDishka[ExecutionCoordinator], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, coordinator.handle_execution_cancelled, idem, KeyStrategy.EVENT_BASED, 7200, logger, @@ -138,7 +138,7 @@ async def on_create_pod( body: CreatePodCommandEvent, worker: FromDishka[KubernetesWorker], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, worker.handle_create_pod_command, idem, KeyStrategy.CONTENT_HASH, 3600, 
logger, @@ -149,7 +149,7 @@ async def on_delete_pod( body: DeletePodCommandEvent, worker: FromDishka[KubernetesWorker], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, worker.handle_delete_pod_command, idem, KeyStrategy.CONTENT_HASH, 3600, logger, @@ -174,7 +174,7 @@ async def on_execution_completed( body: ExecutionCompletedEvent, processor: FromDishka[ResultProcessor], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, processor.handle_execution_completed, idem, KeyStrategy.CONTENT_HASH, 7200, logger, @@ -185,7 +185,7 @@ async def on_execution_failed( body: ExecutionFailedEvent, processor: FromDishka[ResultProcessor], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, processor.handle_execution_failed, idem, KeyStrategy.CONTENT_HASH, 7200, logger, @@ -196,7 +196,7 @@ async def on_execution_timeout( body: ExecutionTimeoutEvent, processor: FromDishka[ResultProcessor], idem: FromDishka[IdempotencyManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: await with_idempotency( body, processor.handle_execution_timeout, idem, KeyStrategy.CONTENT_HASH, 7200, logger, @@ -317,7 +317,7 @@ async def on_dlq_message( body: DLQMessage, msg: KafkaMessage, manager: FromDishka[DLQManager], - logger: FromDishka[logging.Logger], + logger: FromDishka[structlog.stdlib.BoundLogger], ) -> None: start = asyncio.get_running_loop().time() raw = msg.raw_message diff --git a/backend/tests/e2e/app/test_main_app.py b/backend/tests/e2e/app/test_main_app.py index 52c63706..657e9ca5 100644 --- a/backend/tests/e2e/app/test_main_app.py +++ b/backend/tests/e2e/app/test_main_app.py @@ -1,4 +1,3 @@ -import logging from importlib import import_module from typing import Any @@ -260,8 +259,10 @@ async def test_container_resolves_settings(self, scope: AsyncContainer) -> None: @pytest.mark.asyncio async def test_container_resolves_logger(self, scope: AsyncContainer) -> None: """Container can resolve Logger.""" - logger = await scope.get(logging.Logger) - assert isinstance(logger, logging.Logger) + import structlog + + logger = await scope.get(structlog.stdlib.BoundLogger) + assert isinstance(logger, structlog.stdlib.BoundLogger) class TestExceptionHandlers: diff --git a/backend/tests/e2e/core/test_container.py b/backend/tests/e2e/core/test_container.py index 9380bfe9..a6baa761 100644 --- a/backend/tests/e2e/core/test_container.py +++ b/backend/tests/e2e/core/test_container.py @@ -1,6 +1,5 @@ -import logging - import pytest +import structlog import redis.asyncio as aioredis from app.core.security import SecurityService from app.db.docs import UserDocument @@ -32,11 +31,10 @@ async def test_resolves_settings(self, scope: AsyncContainer) -> None: @pytest.mark.asyncio async def test_resolves_logger(self, scope: AsyncContainer) -> None: - """Container resolves Logger.""" - logger = await scope.get(logging.Logger) + """Container resolves BoundLogger.""" + logger = await scope.get(structlog.stdlib.BoundLogger) - assert isinstance(logger, logging.Logger) - assert logger.name == "integr8scode" + assert isinstance(logger, structlog.stdlib.BoundLogger) @pytest.mark.asyncio async def test_beanie_initialized(self, app: FastAPI) -> None: 
From db4484aa65b766ead27dcabcfb9feb6f0a6771c0 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 01:11:07 +0100 Subject: [PATCH 04/12] fix: other issues --- backend/app/api/routes/execution.py | 8 +++++++- backend/app/core/providers.py | 3 --- backend/app/services/execution_service.py | 23 +++++++++++++---------- backend/tests/e2e/app/test_main_app.py | 3 ++- backend/tests/e2e/core/test_container.py | 3 ++- 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/backend/app/api/routes/execution.py b/backend/app/api/routes/execution.py index 03a17f50..21593abc 100644 --- a/backend/app/api/routes/execution.py +++ b/backend/app/api/routes/execution.py @@ -73,6 +73,7 @@ async def create_execution( lang_version=execution.lang_version, user_id=current_user.user_id, idempotency_key=idempotency_key, + correlation_id=request.state.correlation_id, ) return ExecutionResponse.model_validate(exec_result) @@ -98,6 +99,7 @@ async def get_result( }, ) async def cancel_execution( + request: Request, execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], current_user: Annotated[User, Depends(current_user)], cancel_request: CancelExecutionRequest, @@ -109,6 +111,7 @@ async def cancel_execution( current_status=execution.status, user_id=current_user.user_id, reason=cancel_request.reason, + correlation_id=request.state.correlation_id, ) return CancelResponse.model_validate(result) @@ -122,6 +125,7 @@ async def cancel_execution( }, ) async def retry_execution( + request: Request, original_execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], current_user: Annotated[User, Depends(current_user)], execution_service: FromDishka[ExecutionService], @@ -136,6 +140,7 @@ async def retry_execution( lang=original_execution.lang, lang_version=original_execution.lang_version, user_id=current_user.user_id, + correlation_id=request.state.correlation_id, ) return ExecutionResponse.model_validate(new_result) @@ -212,10 +217,11 @@ async def get_k8s_resource_limits( @router.delete("/executions/{execution_id}", response_model=DeleteResponse) async def delete_execution( + request: Request, execution_id: str, admin: Annotated[User, Depends(admin_user)], execution_service: FromDishka[ExecutionService], ) -> DeleteResponse: """Delete an execution and its associated data (admin only).""" - await execution_service.delete_execution(execution_id, admin.user_id) + await execution_service.delete_execution(execution_id, admin.user_id, request.state.correlation_id) return DeleteResponse(message="Execution deleted successfully", execution_id=execution_id) diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index f414636d..65733e53 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -6,7 +6,6 @@ import structlog from apscheduler.schedulers.asyncio import AsyncIOScheduler from dishka import Provider, Scope, from_context, provide -from fastapi import Request from faststream.kafka import KafkaBroker from faststream.kafka.opentelemetry import KafkaTelemetryMiddleware from kubernetes_asyncio import client as k8s_client @@ -703,7 +702,6 @@ def get_execution_service( execution_metrics: ExecutionMetrics, idempotency_manager: IdempotencyManager, runtime_settings: RuntimeSettingsLoader, - request: Request, ) -> ExecutionService: return ExecutionService( execution_repo=execution_repository, @@ -713,7 +711,6 @@ def get_execution_service( execution_metrics=execution_metrics, idempotency_manager=idempotency_manager, runtime_settings=runtime_settings, 
- correlation_id=request.state.correlation_id, ) @provide diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py index f6dedfd5..3ed9d9ab 100644 --- a/backend/app/services/execution_service.py +++ b/backend/app/services/execution_service.py @@ -50,7 +50,6 @@ def __init__( execution_metrics: ExecutionMetrics, idempotency_manager: IdempotencyManager, runtime_settings: RuntimeSettingsLoader, - correlation_id: str, ) -> None: self.execution_repo = execution_repo self.producer = producer @@ -59,7 +58,6 @@ def __init__( self.metrics = execution_metrics self.idempotency_manager = idempotency_manager self._runtime_settings = runtime_settings - self._correlation_id = correlation_id async def get_k8s_resource_limits(self) -> ResourceLimitsDomain: effective = await self._runtime_settings.get_effective_settings() @@ -75,13 +73,13 @@ async def get_k8s_resource_limits(self) -> ResourceLimitsDomain: async def get_example_scripts(self) -> dict[str, str]: return self.settings.EXAMPLE_SCRIPTS - def _create_event_metadata(self, user_id: str) -> EventMetadata: + def _create_event_metadata(self, user_id: str, correlation_id: str = "") -> EventMetadata: """Create standardized event metadata.""" return EventMetadata( service_name="execution-service", service_version="2.0.0", user_id=user_id, - correlation_id=self._correlation_id, + correlation_id=correlation_id, ) async def execute_script( @@ -92,6 +90,7 @@ async def execute_script( lang: str = "python", lang_version: str = "3.11", priority: QueuePriority = QueuePriority.NORMAL, + correlation_id: str = "", ) -> DomainExecution: """ Execute a script by creating an execution record and publishing an event. @@ -149,7 +148,7 @@ async def execute_script( ) # Metadata and event — use admin-configurable limits - metadata = self._create_event_metadata(user_id=user_id) + metadata = self._create_event_metadata(user_id=user_id, correlation_id=correlation_id) effective = await self._runtime_settings.get_effective_settings() event = ExecutionRequestedEvent( execution_id=created_execution.execution_id, @@ -203,6 +202,7 @@ async def cancel_execution( current_status: ExecutionStatus, user_id: str, reason: str = "User requested cancellation", + correlation_id: str = "", ) -> CancelResult: """ Cancel a running or queued execution. @@ -237,7 +237,7 @@ async def cancel_execution( event_id=None, ) - metadata = self._create_event_metadata(user_id=user_id) + metadata = self._create_event_metadata(user_id=user_id, correlation_id=correlation_id) event = ExecutionCancelledEvent( execution_id=execution_id, aggregate_id=execution_id, @@ -268,6 +268,7 @@ async def execute_script_idempotent( lang: str = "python", lang_version: str = "3.11", idempotency_key: str | None = None, + correlation_id: str = "", ) -> DomainExecution: """ Execute a script with optional idempotency support. @@ -288,6 +289,7 @@ async def execute_script_idempotent( lang=lang, lang_version=lang_version, user_id=user_id, + correlation_id=correlation_id, ) pseudo_event = BaseEvent( @@ -332,6 +334,7 @@ async def execute_script_idempotent( lang=lang, lang_version=lang_version, user_id=user_id, + correlation_id=correlation_id, ) await self.idempotency_manager.mark_completed_with_json( @@ -507,7 +510,7 @@ def _build_user_query( return query - async def delete_execution(self, execution_id: str, user_id: str) -> bool: + async def delete_execution(self, execution_id: str, user_id: str, correlation_id: str = "") -> bool: """ Delete an execution and publish deletion event. 
@@ -527,11 +530,11 @@ async def delete_execution(self, execution_id: str, user_id: str) -> bool: self.logger.info("Deleted execution", extra={"execution_id": execution_id}) - await self._publish_deletion_event(execution_id, user_id) + await self._publish_deletion_event(execution_id, user_id, correlation_id) return True - async def _publish_deletion_event(self, execution_id: str, user_id: str) -> None: + async def _publish_deletion_event(self, execution_id: str, user_id: str, correlation_id: str = "") -> None: """ Publish execution deletion/cancellation event. @@ -539,7 +542,7 @@ async def _publish_deletion_event(self, execution_id: str, user_id: str) -> None execution_id: UUID of deleted execution. user_id: ID of user who deleted it. """ - metadata = self._create_event_metadata(user_id=user_id) + metadata = self._create_event_metadata(user_id=user_id, correlation_id=correlation_id) event = ExecutionCancelledEvent( execution_id=execution_id, diff --git a/backend/tests/e2e/app/test_main_app.py b/backend/tests/e2e/app/test_main_app.py index 657e9ca5..08e2b536 100644 --- a/backend/tests/e2e/app/test_main_app.py +++ b/backend/tests/e2e/app/test_main_app.py @@ -262,7 +262,8 @@ async def test_container_resolves_logger(self, scope: AsyncContainer) -> None: import structlog logger = await scope.get(structlog.stdlib.BoundLogger) - assert isinstance(logger, structlog.stdlib.BoundLogger) + assert hasattr(logger, "info") + assert hasattr(logger, "bind") class TestExceptionHandlers: diff --git a/backend/tests/e2e/core/test_container.py b/backend/tests/e2e/core/test_container.py index a6baa761..874d9b24 100644 --- a/backend/tests/e2e/core/test_container.py +++ b/backend/tests/e2e/core/test_container.py @@ -34,7 +34,8 @@ async def test_resolves_logger(self, scope: AsyncContainer) -> None: """Container resolves BoundLogger.""" logger = await scope.get(structlog.stdlib.BoundLogger) - assert isinstance(logger, structlog.stdlib.BoundLogger) + assert hasattr(logger, "info") + assert hasattr(logger, "bind") @pytest.mark.asyncio async def test_beanie_initialized(self, app: FastAPI) -> None: From 9572bc6c1bae3d29eb2b077ea4f6874d33ef956d Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 01:59:03 +0100 Subject: [PATCH 05/12] fix: removed x-corr-id - otel handles tracing on its own --- backend/app/api/routes/admin/events.py | 6 +- backend/app/api/routes/execution.py | 8 +- backend/app/core/correlation.py | 28 ------- backend/app/db/docs/event.py | 1 - backend/app/db/docs/replay.py | 4 +- backend/app/db/docs/user_settings.py | 1 - .../admin/admin_events_repository.py | 3 +- .../app/db/repositories/event_repository.py | 23 ------ .../repositories/user_settings_repository.py | 7 +- backend/app/dlq/manager.py | 3 - backend/app/domain/admin/replay_models.py | 4 +- backend/app/domain/admin/replay_updates.py | 2 +- backend/app/domain/events/event_models.py | 1 - backend/app/domain/events/typed.py | 1 - backend/app/domain/replay/models.py | 7 +- backend/app/domain/saga/models.py | 1 - backend/app/domain/user/settings_models.py | 2 - backend/app/main.py | 5 +- backend/app/schemas_pydantic/admin_events.py | 6 +- backend/app/schemas_pydantic/user_settings.py | 2 - .../services/admin/admin_events_service.py | 24 +++--- .../app/services/coordinator/coordinator.py | 1 - backend/app/services/event_service.py | 17 ---- backend/app/services/execution_service.py | 21 ++--- .../app/services/k8s_worker/pod_builder.py | 11 +-- .../app/services/pod_monitor/event_mapper.py | 10 +-- .../services/result_processor/processor.py 
| 18 ++--- backend/app/services/saga/execution_saga.py | 3 - .../app/services/saga/saga_orchestrator.py | 3 - backend/app/services/user_settings_service.py | 1 - backend/tests/e2e/app/test_main_app.py | 17 +--- backend/tests/e2e/core/test_middlewares.py | 42 ---------- backend/tests/e2e/test_admin_events_routes.py | 4 +- .../unit/core/test_logging_and_correlation.py | 50 ------------ .../tests/unit/events/test_metadata_model.py | 8 -- .../services/pod_monitor/test_event_mapper.py | 8 +- .../unit/services/pod_monitor/test_monitor.py | 1 - .../tests/unit/services/test_pod_builder.py | 5 -- docs/architecture/event-storage.md | 2 +- docs/operations/logging.md | 20 ++--- docs/operations/tracing.md | 2 +- docs/reference/openapi.json | 78 ++----------------- .../admin/events/EventDetailsModal.svelte | 4 - .../admin/events/EventFilters.svelte | 14 ---- .../admin/events/EventsTable.svelte | 6 +- .../__tests__/EventDetailsModal.test.ts | 11 --- .../events/__tests__/EventFilters.test.ts | 1 - .../admin/events/__tests__/eventTypes.test.ts | 13 ++-- frontend/src/lib/admin/events/eventTypes.ts | 5 -- frontend/src/lib/api/types.gen.ts | 38 ++------- frontend/src/routes/admin/AdminEvents.svelte | 3 +- .../admin/__tests__/AdminEvents.test.ts | 4 +- .../src/routes/admin/__tests__/test-utils.ts | 3 - 53 files changed, 80 insertions(+), 483 deletions(-) delete mode 100644 backend/app/core/correlation.py diff --git a/backend/app/api/routes/admin/events.py b/backend/app/api/routes/admin/events.py index e6b26cfa..bc735d3f 100644 --- a/backend/app/api/routes/admin/events.py +++ b/backend/app/api/routes/admin/events.py @@ -58,7 +58,6 @@ async def export_events( service: FromDishka[AdminEventsService], event_types: Annotated[list[EventType] | None, Query(description="Event types (repeat param for multiple)")] = None, aggregate_id: Annotated[str | None, Query(description="Aggregate ID filter")] = None, - correlation_id: Annotated[str | None, Query(description="Correlation ID filter")] = None, user_id: Annotated[str | None, Query(description="User ID filter")] = None, service_name: Annotated[str | None, Query(description="Service name filter")] = None, start_time: Annotated[datetime | None, Query(description="Start time")] = None, @@ -69,7 +68,6 @@ async def export_events( export_filter = EventFilter( event_types=event_types, aggregate_id=aggregate_id, - correlation_id=correlation_id, user_id=user_id, service_name=service_name, start_time=start_time, @@ -105,11 +103,11 @@ async def replay_events( request: EventReplayRequest, background_tasks: BackgroundTasks, service: FromDishka[AdminEventsService] ) -> EventReplayResponse: """Replay events by filter criteria, with optional dry-run mode.""" - replay_correlation_id = f"replay-{uuid4().hex}" + replay_id = f"replay-{uuid4().hex}" result = await service.prepare_or_schedule_replay( replay_filter=ReplayFilter.model_validate(request), dry_run=request.dry_run, - replay_correlation_id=replay_correlation_id, + replay_id=replay_id, target_service=request.target_service, ) diff --git a/backend/app/api/routes/execution.py b/backend/app/api/routes/execution.py index 21593abc..03a17f50 100644 --- a/backend/app/api/routes/execution.py +++ b/backend/app/api/routes/execution.py @@ -73,7 +73,6 @@ async def create_execution( lang_version=execution.lang_version, user_id=current_user.user_id, idempotency_key=idempotency_key, - correlation_id=request.state.correlation_id, ) return ExecutionResponse.model_validate(exec_result) @@ -99,7 +98,6 @@ async def get_result( }, ) async def 
cancel_execution( - request: Request, execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], current_user: Annotated[User, Depends(current_user)], cancel_request: CancelExecutionRequest, @@ -111,7 +109,6 @@ async def cancel_execution( current_status=execution.status, user_id=current_user.user_id, reason=cancel_request.reason, - correlation_id=request.state.correlation_id, ) return CancelResponse.model_validate(result) @@ -125,7 +122,6 @@ async def cancel_execution( }, ) async def retry_execution( - request: Request, original_execution: Annotated[ExecutionInDB, Depends(get_execution_with_access)], current_user: Annotated[User, Depends(current_user)], execution_service: FromDishka[ExecutionService], @@ -140,7 +136,6 @@ async def retry_execution( lang=original_execution.lang, lang_version=original_execution.lang_version, user_id=current_user.user_id, - correlation_id=request.state.correlation_id, ) return ExecutionResponse.model_validate(new_result) @@ -217,11 +212,10 @@ async def get_k8s_resource_limits( @router.delete("/executions/{execution_id}", response_model=DeleteResponse) async def delete_execution( - request: Request, execution_id: str, admin: Annotated[User, Depends(admin_user)], execution_service: FromDishka[ExecutionService], ) -> DeleteResponse: """Delete an execution and its associated data (admin only).""" - await execution_service.delete_execution(execution_id, admin.user_id, request.state.correlation_id) + await execution_service.delete_execution(execution_id, admin.user_id) return DeleteResponse(message="Execution deleted successfully", execution_id=execution_id) diff --git a/backend/app/core/correlation.py b/backend/app/core/correlation.py deleted file mode 100644 index 9015fb27..00000000 --- a/backend/app/core/correlation.py +++ /dev/null @@ -1,28 +0,0 @@ -import uuid - -from starlette.datastructures import Headers, MutableHeaders -from starlette.types import ASGIApp, Message, Receive, Scope, Send - - -class CorrelationMiddleware: - CORRELATION_HEADER = "X-Correlation-ID" - - def __init__(self, app: ASGIApp) -> None: - self.app = app - - async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: - if scope["type"] != "http": - await self.app(scope, receive, send) - return - - headers = Headers(scope=scope) - correlation_id = headers.get("x-correlation-id") or f"req-{uuid.uuid4().hex}" - - scope.setdefault("state", {})["correlation_id"] = correlation_id - - async def send_wrapper(message: Message) -> None: - if message["type"] == "http.response.start": - MutableHeaders(scope=message)[self.CORRELATION_HEADER] = correlation_id - await send(message) - - await self.app(scope, receive, send_wrapper) diff --git a/backend/app/db/docs/event.py b/backend/app/db/docs/event.py index 2ce1a126..71cf4044 100644 --- a/backend/app/db/docs/event.py +++ b/backend/app/db/docs/event.py @@ -38,7 +38,6 @@ class Settings: # Compound indexes for common query patterns IndexModel([("event_type", ASCENDING), ("timestamp", DESCENDING)], name="idx_event_type_ts"), IndexModel([("aggregate_id", ASCENDING), ("timestamp", DESCENDING)], name="idx_aggregate_ts"), - IndexModel([("metadata.correlation_id", ASCENDING)], name="idx_meta_correlation"), IndexModel([("metadata.user_id", ASCENDING), ("timestamp", DESCENDING)], name="idx_meta_user_ts"), IndexModel([("metadata.service_name", ASCENDING), ("timestamp", DESCENDING)], name="idx_meta_service_ts"), # Event-specific field indexes (sparse - only exist on relevant event types) diff --git a/backend/app/db/docs/replay.py 
b/backend/app/db/docs/replay.py index 37ec9441..eb784dc1 100644 --- a/backend/app/db/docs/replay.py +++ b/backend/app/db/docs/replay.py @@ -60,7 +60,7 @@ class ReplaySessionDocument(Document): errors: list[ReplayError] = Field(default_factory=list) # Tracking and admin fields - correlation_id: str = Field(default_factory=lambda: str(uuid4())) + replay_id: str = Field(default_factory=lambda: str(uuid4())) created_by: str | None = None target_service: str | None = None dry_run: bool = False @@ -91,5 +91,5 @@ class Settings: use_state_management = True indexes = [ IndexModel([("status", 1)]), - IndexModel([("correlation_id", 1)]), + IndexModel([("replay_id", 1)]), ] diff --git a/backend/app/db/docs/user_settings.py b/backend/app/db/docs/user_settings.py index 18015be5..2840d638 100644 --- a/backend/app/db/docs/user_settings.py +++ b/backend/app/db/docs/user_settings.py @@ -98,7 +98,6 @@ class UserSettingsSnapshotDocument(Document): # Snapshot metadata reason: str | None = None - correlation_id: str | None = None model_config = ConfigDict(from_attributes=True) diff --git a/backend/app/db/repositories/admin/admin_events_repository.py b/backend/app/db/repositories/admin/admin_events_repository.py index 205f195d..5b4846d5 100644 --- a/backend/app/db/repositories/admin/admin_events_repository.py +++ b/backend/app/db/repositories/admin/admin_events_repository.py @@ -34,7 +34,6 @@ def _event_filter_conditions(self, f: EventFilter) -> list[Any]: conditions = [ In(EventDocument.event_type, f.event_types) if f.event_types else None, EventDocument.aggregate_id == f.aggregate_id if f.aggregate_id else None, - EventDocument.metadata.correlation_id == f.correlation_id if f.correlation_id else None, EventDocument.metadata.user_id == f.user_id if f.user_id else None, EventDocument.metadata.service_name == f.service_name if f.service_name else None, GTE(EventDocument.timestamp, f.start_time) if f.start_time else None, @@ -79,7 +78,7 @@ async def get_event_detail(self, event_id: str) -> EventDetail | None: event = DomainEventAdapter.validate_python(doc) - related_query = {"metadata.correlation_id": doc.metadata.correlation_id, "event_id": {"$ne": event_id}} + related_query = {"aggregate_id": doc.aggregate_id, "event_id": {"$ne": event_id}} related_docs = await ( EventDocument.find(related_query).sort([("timestamp", SortDirection.ASCENDING)]).limit(10).to_list() ) diff --git a/backend/app/db/repositories/event_repository.py b/backend/app/db/repositories/event_repository.py index ddcc16de..19f23270 100644 --- a/backend/app/db/repositories/event_repository.py +++ b/backend/app/db/repositories/event_repository.py @@ -74,29 +74,6 @@ async def get_events_by_aggregate( ) return [DomainEventAdapter.validate_python(d) for d in docs] - async def get_events_by_correlation( - self, correlation_id: str, limit: int = 100, skip: int = 0, user_id: str | None = None, - ) -> EventListResult: - conditions = [EventDocument.metadata.correlation_id == correlation_id] - if user_id: - conditions.append(EventDocument.metadata.user_id == user_id) - condition = {"$and": conditions} if len(conditions) > 1 else conditions[0] - docs = ( - await EventDocument.find(condition) - .sort([("timestamp", SortDirection.ASCENDING)]) - .skip(skip).limit(limit).to_list() - ) - events = [DomainEventAdapter.validate_python(d) for d in docs] - total_count = await EventDocument.find(condition).count() - total_count = max(total_count, skip + len(events)) - return EventListResult( - events=events, - total=total_count, - skip=skip, - limit=limit, - 
has_more=(skip + limit) < total_count, - ) - async def get_execution_events( self, execution_id: str, diff --git a/backend/app/db/repositories/user_settings_repository.py b/backend/app/db/repositories/user_settings_repository.py index f15f5d28..6002c26c 100644 --- a/backend/app/db/repositories/user_settings_repository.py +++ b/backend/app/db/repositories/user_settings_repository.py @@ -52,12 +52,7 @@ async def get_settings_events( find_query = find_query.limit(limit) docs = await find_query.to_list() - return [ - DomainUserSettingsChangedEvent.model_validate(e).model_copy( - update={"correlation_id": e.metadata.correlation_id} - ) - for e in docs - ] + return [DomainUserSettingsChangedEvent.model_validate(e) for e in docs] async def count_events_since_snapshot(self, user_id: str) -> int: aggregate_id = f"user_settings_{user_id}" diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index bfc910f5..d4ed3c95 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -99,7 +99,6 @@ async def handle_message(self, message: DLQMessage) -> None: metadata=EventMetadata( service_name="dlq-manager", service_version="1.0.0", - correlation_id=message.event.metadata.correlation_id, user_id=message.event.metadata.user_id, ), ), @@ -154,7 +153,6 @@ async def retry_message(self, message: DLQMessage) -> None: metadata=EventMetadata( service_name="dlq-manager", service_version="1.0.0", - correlation_id=message.event.metadata.correlation_id, user_id=message.event.metadata.user_id, ), ), @@ -185,7 +183,6 @@ async def discard_message(self, message: DLQMessage, reason: str) -> None: metadata=EventMetadata( service_name="dlq-manager", service_version="1.0.0", - correlation_id=message.event.metadata.correlation_id, user_id=message.event.metadata.user_id, ), ), diff --git a/backend/app/domain/admin/replay_models.py b/backend/app/domain/admin/replay_models.py index 1e274f3a..c6c22704 100644 --- a/backend/app/domain/admin/replay_models.py +++ b/backend/app/domain/admin/replay_models.py @@ -44,7 +44,7 @@ class ReplaySessionStatusInfo: replayed_events: int failed_events: int skipped_events: int - correlation_id: str + replay_id: str created_at: datetime started_at: datetime | None = None completed_at: datetime | None = None @@ -57,7 +57,7 @@ class ReplaySessionData: """Unified replay session data for both preview and actual replay.""" total_events: int - replay_correlation_id: str + replay_id: str dry_run: bool filter: ReplayFilter events_preview: list[EventSummary] = field(default_factory=list) diff --git a/backend/app/domain/admin/replay_updates.py b/backend/app/domain/admin/replay_updates.py index 5ada0b9a..89f8eb5e 100644 --- a/backend/app/domain/admin/replay_updates.py +++ b/backend/app/domain/admin/replay_updates.py @@ -15,7 +15,7 @@ class ReplaySessionUpdate(BaseModel): replayed_events: int | None = None failed_events: int | None = None skipped_events: int | None = None - correlation_id: str | None = None + replay_id: str | None = None started_at: datetime | None = None completed_at: datetime | None = None error: str | None = None diff --git a/backend/app/domain/events/event_models.py b/backend/app/domain/events/event_models.py index 580489e4..e1f5c7a7 100644 --- a/backend/app/domain/events/event_models.py +++ b/backend/app/domain/events/event_models.py @@ -47,7 +47,6 @@ class EventFilter(BaseModel): event_types: list[EventType] | None = None aggregate_id: str | None = None - correlation_id: str | None = None user_id: str | None = None service_name: str | None = None start_time: 
datetime | None = None diff --git a/backend/app/domain/events/typed.py b/backend/app/domain/events/typed.py index acbaca32..a250e7a4 100644 --- a/backend/app/domain/events/typed.py +++ b/backend/app/domain/events/typed.py @@ -34,7 +34,6 @@ class EventMetadata(BaseModel): service_name: str service_version: str - correlation_id: str = "" user_id: str = Field(default_factory=lambda: str(uuid4())) environment: Environment = Environment.PRODUCTION diff --git a/backend/app/domain/replay/models.py b/backend/app/domain/replay/models.py index 5f1642a7..3914e52c 100644 --- a/backend/app/domain/replay/models.py +++ b/backend/app/domain/replay/models.py @@ -33,7 +33,6 @@ class ReplayFilter(BaseModel): # Event selection filters event_ids: list[str] | None = None execution_id: str | None = None - correlation_id: str | None = None aggregate_id: str | None = None event_types: list[EventType] | None = None exclude_event_types: list[EventType] | None = None @@ -51,7 +50,6 @@ def is_empty(self) -> bool: [ self.event_ids, self.execution_id, - self.correlation_id, self.aggregate_id, self.event_types, self.start_time, @@ -70,9 +68,6 @@ def to_mongo_query(self) -> dict[str, Any]: if self.execution_id: query["execution_id"] = self.execution_id - if self.correlation_id: - query["metadata.correlation_id"] = self.correlation_id - if self.aggregate_id: query["aggregate_id"] = self.aggregate_id @@ -146,7 +141,7 @@ class ReplaySessionState(BaseModel): errors: list[ReplayError] = Field(default_factory=list) # Tracking and admin fields - correlation_id: str = Field(default_factory=lambda: str(uuid4())) + replay_id: str = Field(default_factory=lambda: str(uuid4())) created_by: str | None = None target_service: str | None = None dry_run: bool = False diff --git a/backend/app/domain/saga/models.py b/backend/app/domain/saga/models.py index 36ee931b..86303031 100644 --- a/backend/app/domain/saga/models.py +++ b/backend/app/domain/saga/models.py @@ -18,7 +18,6 @@ class SagaContextData(BaseModel): timeout_seconds: int | None = None allocation_id: str | None = None resources_allocated: bool = False - correlation_id: str = Field(default_factory=lambda: str(uuid4())) pod_creation_triggered: bool = False user_id: str = Field(default_factory=lambda: str(uuid4())) diff --git a/backend/app/domain/user/settings_models.py b/backend/app/domain/user/settings_models.py index d0b61d83..9c7aef69 100644 --- a/backend/app/domain/user/settings_models.py +++ b/backend/app/domain/user/settings_models.py @@ -85,7 +85,6 @@ class DomainUserSettingsChangedEvent(BaseModel): editor: DomainEditorSettings | None = None custom_settings: dict[str, Any] | None = None reason: str | None = None - correlation_id: str | None = None class DomainSettingsHistoryEntry(BaseModel): @@ -97,7 +96,6 @@ class DomainSettingsHistoryEntry(BaseModel): old_value: Any new_value: Any reason: str | None = None - correlation_id: str | None = None class CachedSettings(BaseModel): diff --git a/backend/app/main.py b/backend/app/main.py index 33450805..bd5ba081 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -25,7 +25,6 @@ users_router as admin_users_router, ) from app.core.container import create_app_container -from app.core.correlation import CorrelationMiddleware from app.core.dishka_lifespan import lifespan from app.core.exceptions import configure_exception_handlers from app.core.logging import setup_logger @@ -75,7 +74,6 @@ def create_app(settings: Settings | None = None) -> FastAPI: app.add_middleware(MetricsMiddleware) app.add_middleware(RateLimitMiddleware, 
settings=settings) app.add_middleware(CSRFMiddleware) - app.add_middleware(CorrelationMiddleware) app.add_middleware(RequestSizeLimitMiddleware) app.add_middleware(CacheControlMiddleware) @@ -100,9 +98,8 @@ def create_app(settings: Settings | None = None) -> FastAPI: "Origin", "X-Requested-With", "X-CSRF-Token", - "X-Correlation-ID", ], - expose_headers=["Content-Length", "Content-Range", "X-Correlation-ID"], + expose_headers=["Content-Length", "Content-Range"], ) logger.info("CORS middleware configured") diff --git a/backend/app/schemas_pydantic/admin_events.py b/backend/app/schemas_pydantic/admin_events.py index 14fc083c..360a0c54 100644 --- a/backend/app/schemas_pydantic/admin_events.py +++ b/backend/app/schemas_pydantic/admin_events.py @@ -14,7 +14,6 @@ class EventFilter(BaseModel): event_types: list[EventType] | None = None aggregate_id: str | None = None - correlation_id: str | None = None user_id: str | None = None start_time: datetime | None = None end_time: datetime | None = None @@ -34,7 +33,6 @@ class EventReplayRequest(BaseModel): """Request model for replaying events""" event_ids: list[str] | None = None - correlation_id: str | None = None aggregate_id: str | None = None start_time: datetime | None = None end_time: datetime | None = None @@ -70,7 +68,7 @@ class EventReplayResponse(BaseModel): dry_run: bool total_events: int - replay_correlation_id: str + replay_id: str session_id: str | None = None status: ReplayStatus events_preview: list[EventSummary] | None = None @@ -87,7 +85,7 @@ class EventReplayStatusResponse(BaseModel): replayed_events: int failed_events: int skipped_events: int - correlation_id: str + replay_id: str created_at: datetime started_at: datetime | None = None completed_at: datetime | None = None diff --git a/backend/app/schemas_pydantic/user_settings.py b/backend/app/schemas_pydantic/user_settings.py index 53a0cb8e..76140933 100644 --- a/backend/app/schemas_pydantic/user_settings.py +++ b/backend/app/schemas_pydantic/user_settings.py @@ -104,7 +104,6 @@ class SettingsHistoryEntry(BaseModel): old_value: Any new_value: Any reason: str | None = None - correlation_id: str | None = None class SettingsHistoryResponse(BaseModel): @@ -129,4 +128,3 @@ class SettingsEvent(BaseModel): event_type: EventType timestamp: datetime payload: dict[str, Any] - correlation_id: str | None = None diff --git a/backend/app/services/admin/admin_events_service.py b/backend/app/services/admin/admin_events_service.py index cd1dbfdc..1fda61c7 100644 --- a/backend/app/services/admin/admin_events_service.py +++ b/backend/app/services/admin/admin_events_service.py @@ -32,7 +32,6 @@ def _export_row_to_dict(row: EventExportRow) -> dict[str, str]: "Event ID": data["event_id"], "Event Type": data["event_type"], "Timestamp": data["timestamp"], - "Correlation ID": meta.get("correlation_id") or "", "Aggregate ID": data.get("aggregate_id") or "", "User ID": meta.get("user_id") or "", "Service": meta.get("service_name", ""), @@ -47,14 +46,14 @@ def __init__( *, dry_run: bool, total_events: int, - replay_correlation_id: str, + replay_id: str, status: ReplayStatus, session_id: str | None = None, events_preview: list[EventSummary] | None = None, ) -> None: self.dry_run = dry_run self.total_events = total_events - self.replay_correlation_id = replay_correlation_id + self.replay_id = replay_id self.status = status self.session_id = session_id self.events_preview = events_preview @@ -98,7 +97,7 @@ async def prepare_or_schedule_replay( *, replay_filter: ReplayFilter, dry_run: bool, - 
replay_correlation_id: str, + replay_id: str, target_service: str | None, ) -> AdminReplayResult: if replay_filter.is_empty(): @@ -108,7 +107,7 @@ async def prepare_or_schedule_replay( "Preparing replay session", extra={ "dry_run": dry_run, - "replay_correlation_id": replay_correlation_id, + "replay_id": replay_id, }, ) @@ -126,7 +125,7 @@ async def prepare_or_schedule_replay( session_data = ReplaySessionData( total_events=event_count, - replay_correlation_id=replay_correlation_id, + replay_id=replay_id, dry_run=dry_run, filter=replay_filter, events_preview=events_preview, @@ -136,7 +135,7 @@ async def prepare_or_schedule_replay( result = AdminReplayResult( dry_run=True, total_events=session_data.total_events, - replay_correlation_id=replay_correlation_id, + replay_id=replay_id, status=ReplayStatus.PREVIEW, events_preview=session_data.events_preview, ) @@ -144,7 +143,7 @@ async def prepare_or_schedule_replay( "Replay dry-run prepared", extra={ "total_events": result.total_events, - "replay_correlation_id": result.replay_correlation_id, + "replay_id": result.replay_id, }, ) return result @@ -167,7 +166,7 @@ async def prepare_or_schedule_replay( # Persist additional metadata to the admin replay session record session_update = ReplaySessionUpdate( total_events=session_data.total_events, - correlation_id=replay_correlation_id, + replay_id=replay_id, status=ReplayStatus.SCHEDULED, ) await self._repo.update_replay_session( @@ -178,7 +177,7 @@ async def prepare_or_schedule_replay( result = AdminReplayResult( dry_run=False, total_events=session_data.total_events, - replay_correlation_id=replay_correlation_id, + replay_id=replay_id, session_id=session_id, status=ReplayStatus.SCHEDULED, ) @@ -187,7 +186,7 @@ async def prepare_or_schedule_replay( extra={ "session_id": result.session_id, "total_events": result.total_events, - "replay_correlation_id": result.replay_correlation_id, + "replay_id": result.replay_id, }, ) return result @@ -232,7 +231,6 @@ async def _export_csv(self, *, event_filter: EventFilter, limit: int) -> ExportR "Event ID", "Event Type", "Timestamp", - "Correlation ID", "Aggregate ID", "User ID", "Service", @@ -288,13 +286,11 @@ async def delete_event(self, *, event_id: str, deleted_by: str) -> bool: await self._repo.archive_event(detail.event, deleted_by) deleted = await self._repo.delete_event(event_id) if deleted: - correlation_id = detail.event.metadata.correlation_id self.logger.info( "Event deleted", extra={ "event_id": event_id, "event_type": detail.event.event_type, - "correlation_id": correlation_id, "deleted_by": deleted_by, }, ) diff --git a/backend/app/services/coordinator/coordinator.py b/backend/app/services/coordinator/coordinator.py index 32ee9c29..220483ca 100644 --- a/backend/app/services/coordinator/coordinator.py +++ b/backend/app/services/coordinator/coordinator.py @@ -273,7 +273,6 @@ async def _build_command_metadata(self, request: ExecutionRequestedEvent) -> Eve service_name="execution-coordinator", service_version="1.0.0", user_id=request.metadata.user_id, - correlation_id=request.metadata.correlation_id, ) async def _publish_create_pod_command(self, request: ExecutionRequestedEvent) -> None: diff --git a/backend/app/services/event_service.py b/backend/app/services/event_service.py index db039469..596aa4bd 100644 --- a/backend/app/services/event_service.py +++ b/backend/app/services/event_service.py @@ -21,8 +21,6 @@ def _filter_to_mongo_query(flt: EventFilter) -> dict[str, Any]: query["event_type"] = {"$in": flt.event_types} if flt.aggregate_id: 
query["aggregate_id"] = flt.aggregate_id - if flt.correlation_id: - query["metadata.correlation_id"] = flt.correlation_id if flt.user_id: query["metadata.user_id"] = flt.user_id if flt.service_name: @@ -126,7 +124,6 @@ async def query_events_advanced( "timestamp": "timestamp", "event_type": "event_type", "aggregate_id": "aggregate_id", - "correlation_id": "metadata.correlation_id", "stored_at": "stored_at", } sort_field = field_map.get(sort_by, "timestamp") @@ -138,20 +135,6 @@ async def query_events_advanced( limit=limit, ) - async def get_events_by_correlation( - self, - correlation_id: str, - user_id: str, - user_role: UserRole, - include_all_users: bool = False, - limit: int = 100, - skip: int = 0, - ) -> EventListResult: - filter_user = user_id if not include_all_users or user_role != UserRole.ADMIN else None - return await self.repository.get_events_by_correlation( - correlation_id=correlation_id, limit=limit, skip=skip, user_id=filter_user, - ) - async def get_event_statistics( self, user_id: str, diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py index 3ed9d9ab..1ba03bc6 100644 --- a/backend/app/services/execution_service.py +++ b/backend/app/services/execution_service.py @@ -73,13 +73,12 @@ async def get_k8s_resource_limits(self) -> ResourceLimitsDomain: async def get_example_scripts(self) -> dict[str, str]: return self.settings.EXAMPLE_SCRIPTS - def _create_event_metadata(self, user_id: str, correlation_id: str = "") -> EventMetadata: + def _create_event_metadata(self, user_id: str) -> EventMetadata: """Create standardized event metadata.""" return EventMetadata( service_name="execution-service", service_version="2.0.0", user_id=user_id, - correlation_id=correlation_id, ) async def execute_script( @@ -90,7 +89,6 @@ async def execute_script( lang: str = "python", lang_version: str = "3.11", priority: QueuePriority = QueuePriority.NORMAL, - correlation_id: str = "", ) -> DomainExecution: """ Execute a script by creating an execution record and publishing an event. @@ -148,7 +146,7 @@ async def execute_script( ) # Metadata and event — use admin-configurable limits - metadata = self._create_event_metadata(user_id=user_id, correlation_id=correlation_id) + metadata = self._create_event_metadata(user_id=user_id) effective = await self._runtime_settings.get_effective_settings() event = ExecutionRequestedEvent( execution_id=created_execution.execution_id, @@ -202,7 +200,6 @@ async def cancel_execution( current_status: ExecutionStatus, user_id: str, reason: str = "User requested cancellation", - correlation_id: str = "", ) -> CancelResult: """ Cancel a running or queued execution. @@ -237,7 +234,7 @@ async def cancel_execution( event_id=None, ) - metadata = self._create_event_metadata(user_id=user_id, correlation_id=correlation_id) + metadata = self._create_event_metadata(user_id=user_id) event = ExecutionCancelledEvent( execution_id=execution_id, aggregate_id=execution_id, @@ -268,7 +265,6 @@ async def execute_script_idempotent( lang: str = "python", lang_version: str = "3.11", idempotency_key: str | None = None, - correlation_id: str = "", ) -> DomainExecution: """ Execute a script with optional idempotency support. 
@@ -289,7 +285,6 @@ async def execute_script_idempotent( lang=lang, lang_version=lang_version, user_id=user_id, - correlation_id=correlation_id, ) pseudo_event = BaseEvent( @@ -298,7 +293,6 @@ async def execute_script_idempotent( timestamp=datetime.now(timezone.utc), metadata=EventMetadata( user_id=user_id, - correlation_id=str(uuid4()), service_name="api", service_version="1.0.0", ), @@ -334,7 +328,6 @@ async def execute_script_idempotent( lang=lang, lang_version=lang_version, user_id=user_id, - correlation_id=correlation_id, ) await self.idempotency_manager.mark_completed_with_json( @@ -510,7 +503,7 @@ def _build_user_query( return query - async def delete_execution(self, execution_id: str, user_id: str, correlation_id: str = "") -> bool: + async def delete_execution(self, execution_id: str, user_id: str) -> bool: """ Delete an execution and publish deletion event. @@ -530,11 +523,11 @@ async def delete_execution(self, execution_id: str, user_id: str, correlation_id self.logger.info("Deleted execution", extra={"execution_id": execution_id}) - await self._publish_deletion_event(execution_id, user_id, correlation_id) + await self._publish_deletion_event(execution_id, user_id) return True - async def _publish_deletion_event(self, execution_id: str, user_id: str, correlation_id: str = "") -> None: + async def _publish_deletion_event(self, execution_id: str, user_id: str) -> None: """ Publish execution deletion/cancellation event. @@ -542,7 +535,7 @@ async def _publish_deletion_event(self, execution_id: str, user_id: str, correla execution_id: UUID of deleted execution. user_id: ID of user who deleted it. """ - metadata = self._create_event_metadata(user_id=user_id, correlation_id=correlation_id) + metadata = self._create_event_metadata(user_id=user_id) event = ExecutionCancelledEvent( execution_id=execution_id, diff --git a/backend/app/services/k8s_worker/pod_builder.py b/backend/app/services/k8s_worker/pod_builder.py index 045b80bb..3f621add 100644 --- a/backend/app/services/k8s_worker/pod_builder.py +++ b/backend/app/services/k8s_worker/pod_builder.py @@ -20,7 +20,6 @@ def build_pod_manifest(self, command: CreatePodCommandEvent) -> k8s_client.V1Pod execution_id=execution_id, user_id=command.metadata.user_id, language=command.language, - correlation_id=command.metadata.correlation_id, saga_id=command.saga_id, ) @@ -151,18 +150,13 @@ def _build_pod_metadata( execution_id: str, user_id: str, language: str, - correlation_id: str | None = None, saga_id: str | None = None, ) -> k8s_client.V1ObjectMeta: - """Build pod metadata with correlation and saga tracking""" + """Build pod metadata with saga tracking""" labels = {"app": "integr8s", "component": "executor", "execution-id": execution_id, "language": language} labels["user-id"] = user_id[:63] # K8s label value limit - # Add correlation_id if provided (truncate to K8s label limit) - if correlation_id: - labels["correlation-id"] = correlation_id[:63] - # Add saga_id if provided (truncate to K8s label limit) if saga_id: labels["saga-id"] = saga_id[:63] @@ -173,9 +167,6 @@ def _build_pod_metadata( "integr8s.io/language": language, } - if correlation_id: - annotations["integr8s.io/correlation-id"] = correlation_id - if saga_id: annotations["integr8s.io/saga-id"] = saga_id diff --git a/backend/app/services/pod_monitor/event_mapper.py b/backend/app/services/pod_monitor/event_mapper.py index 8710060a..a2615b7c 100644 --- a/backend/app/services/pod_monitor/event_mapper.py +++ b/backend/app/services/pod_monitor/event_mapper.py @@ -178,21 +178,15 @@ def 
_extract_execution_id(self, pod: k8s_client.V1Pod) -> str | None: return None def _create_metadata(self, pod: k8s_client.V1Pod) -> EventMetadata: - """Create event metadata from pod with correlation tracking""" + """Create event metadata from pod""" labels = pod.metadata.labels or {} - annotations = pod.metadata.annotations or {} - - # Try to get correlation_id from annotations first (full value), - # then labels (potentially truncated) - correlation_id = annotations.get("integr8s.io/correlation-id") or labels.get("correlation-id") or "" md = EventMetadata( user_id=labels.get("user-id", str(uuid4())), service_name=GroupId.POD_MONITOR, service_version="1.0.0", - correlation_id=correlation_id, ) - self.logger.info(f"POD-EVENT: metadata user_id={md.user_id} corr={md.correlation_id} name={pod.metadata.name}") + self.logger.info(f"POD-EVENT: metadata user_id={md.user_id} name={pod.metadata.name}") return md def _is_duplicate(self, pod_name: str, phase: PodPhase) -> bool: diff --git a/backend/app/services/result_processor/processor.py b/backend/app/services/result_processor/processor.py index eaea4a81..09f58caa 100644 --- a/backend/app/services/result_processor/processor.py +++ b/backend/app/services/result_processor/processor.py @@ -68,10 +68,10 @@ async def handle_execution_completed(self, event: DomainEvent) -> None: meta = event.metadata try: await self._execution_repo.write_terminal_result(result) - await self._publish_result_stored(result, meta.correlation_id, meta.user_id) + await self._publish_result_stored(result, meta.user_id) except Exception as e: self.logger.error(f"Failed to handle ExecutionCompletedEvent: {e}", exc_info=True) - await self._publish_result_failed(event.execution_id, str(e), meta.correlation_id, meta.user_id) + await self._publish_result_failed(event.execution_id, str(e), meta.user_id) async def handle_execution_failed(self, event: DomainEvent) -> None: """Handle execution failed event.""" @@ -90,10 +90,10 @@ async def handle_execution_failed(self, event: DomainEvent) -> None: meta = event.metadata try: await self._execution_repo.write_terminal_result(result) - await self._publish_result_stored(result, meta.correlation_id, meta.user_id) + await self._publish_result_stored(result, meta.user_id) except Exception as e: self.logger.error(f"Failed to handle ExecutionFailedEvent: {e}", exc_info=True) - await self._publish_result_failed(event.execution_id, str(e), meta.correlation_id, meta.user_id) + await self._publish_result_failed(event.execution_id, str(e), meta.user_id) async def handle_execution_timeout(self, event: DomainEvent) -> None: """Handle execution timeout event.""" @@ -116,12 +116,12 @@ async def handle_execution_timeout(self, event: DomainEvent) -> None: meta = event.metadata try: await self._execution_repo.write_terminal_result(result) - await self._publish_result_stored(result, meta.correlation_id, meta.user_id) + await self._publish_result_stored(result, meta.user_id) except Exception as e: self.logger.error(f"Failed to handle ExecutionTimeoutEvent: {e}", exc_info=True) - await self._publish_result_failed(event.execution_id, str(e), meta.correlation_id, meta.user_id) + await self._publish_result_failed(event.execution_id, str(e), meta.user_id) - async def _publish_result_stored(self, result: ExecutionResultDomain, correlation_id: str, user_id: str) -> None: + async def _publish_result_stored(self, result: ExecutionResultDomain, user_id: str) -> None: """Publish result stored event.""" size_bytes = len(result.stdout) + len(result.stderr) event = 
ResultStoredEvent( @@ -132,14 +132,13 @@ async def _publish_result_stored(self, result: ExecutionResultDomain, correlatio metadata=EventMetadata( service_name=GroupId.RESULT_PROCESSOR, service_version="1.0.0", - correlation_id=correlation_id, user_id=user_id, ), ) await self._producer.produce(event_to_produce=event, key=result.execution_id) async def _publish_result_failed( - self, execution_id: str, error_message: str, correlation_id: str, user_id: str, + self, execution_id: str, error_message: str, user_id: str, ) -> None: """Publish result processing failed event.""" event = ResultFailedEvent( @@ -148,7 +147,6 @@ async def _publish_result_failed( metadata=EventMetadata( service_name=GroupId.RESULT_PROCESSOR, service_version="1.0.0", - correlation_id=correlation_id, user_id=user_id, ), ) diff --git a/backend/app/services/saga/execution_saga.py b/backend/app/services/saga/execution_saga.py index 003c08df..1401fb79 100644 --- a/backend/app/services/saga/execution_saga.py +++ b/backend/app/services/saga/execution_saga.py @@ -115,13 +115,11 @@ async def execute(self, context: SagaContext, event: ExecutionRequestedEvent) -> service_name="saga-orchestrator", service_version="1.0.0", user_id=event.metadata.user_id, - correlation_id=event.metadata.correlation_id, ), ) await self.producer.produce(event_to_produce=create_pod_cmd, key=execution_id) - context.set("correlation_id", event.metadata.correlation_id) context.set("pod_creation_triggered", True) logger.info(f"CreatePodCommandEvent published for execution {execution_id}") @@ -174,7 +172,6 @@ async def compensate(self, context: SagaContext) -> bool: service_name="saga-orchestrator", service_version="1.0.0", user_id=context.get("user_id") or str(uuid4()), - correlation_id=context.get("correlation_id", ""), ), ) diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index b2e3fa0e..e6c96473 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -114,7 +114,6 @@ async def _start_saga(self, trigger_event: ExecutionRequestedEvent) -> str: saga = self._create_saga_instance() context = SagaContext(instance.saga_id, execution_id) - context.set("correlation_id", trigger_event.metadata.correlation_id) context.set("user_id", trigger_event.metadata.user_id) await self._execute_saga(saga, instance, context, trigger_event) @@ -326,7 +325,6 @@ async def _publish_saga_started_event( service_name="saga-orchestrator", service_version="1.0.0", user_id=trigger_event.metadata.user_id, - correlation_id=trigger_event.metadata.correlation_id, ), ) await self._producer.produce(event_to_produce=event, key=instance.execution_id) @@ -342,7 +340,6 @@ async def _publish_saga_cancelled_event(self, saga_instance: Saga) -> None: service_name="saga-orchestrator", service_version="1.0.0", user_id=cancelled_by, - correlation_id=saga_instance.context_data.correlation_id, ) event = SagaCancelledEvent( diff --git a/backend/app/services/user_settings_service.py b/backend/app/services/user_settings_service.py index a2af9029..bee8794f 100644 --- a/backend/app/services/user_settings_service.py +++ b/backend/app/services/user_settings_service.py @@ -158,7 +158,6 @@ async def get_settings_history(self, user_id: str, limit: int = 50) -> list[Doma old_value=None, new_value=event.model_dump().get(fld), reason=event.reason, - correlation_id=event.correlation_id, ) ) return history diff --git a/backend/tests/e2e/app/test_main_app.py b/backend/tests/e2e/app/test_main_app.py index 
08e2b536..b1e7a914 100644 --- a/backend/tests/e2e/app/test_main_app.py +++ b/backend/tests/e2e/app/test_main_app.py @@ -128,11 +128,6 @@ def test_cors_middleware_configured(self, app: FastAPI) -> None: middleware_classes = self._get_middleware_class_names(app) assert "CORSMiddleware" in middleware_classes - def test_correlation_middleware_configured(self, app: FastAPI) -> None: - """Correlation ID middleware is configured.""" - middleware_classes = self._get_middleware_class_names(app) - assert "CorrelationMiddleware" in middleware_classes - def test_request_size_limit_middleware_configured(self, app: FastAPI) -> None: """Request size limit middleware is configured.""" middleware_classes = self._get_middleware_class_names(app) @@ -160,11 +155,10 @@ def test_csrf_middleware_configured(self, app: FastAPI) -> None: def test_middleware_count(self, app: FastAPI) -> None: """Expected number of middlewares are configured.""" - # CORS, Correlation, RequestSizeLimit, CacheControl, Metrics, RateLimit, CSRF + # CORS, RequestSizeLimit, CacheControl, Metrics, RateLimit, CSRF middleware_classes = self._get_middleware_class_names(app) expected_middlewares = { "CORSMiddleware", - "CorrelationMiddleware", "RequestSizeLimitMiddleware", "CacheControlMiddleware", "MetricsMiddleware", @@ -220,15 +214,6 @@ def test_cors_allows_required_headers(self, app: FastAPI) -> None: assert "Authorization" in headers assert "Content-Type" in headers assert "X-CSRF-Token" in headers - assert "X-Correlation-ID" in headers - - def test_cors_exposes_correlation_header(self, app: FastAPI) -> None: - """CORS exposes X-Correlation-ID header to clients.""" - cors_kwargs = self._get_cors_kwargs(app) - assert cors_kwargs is not None - - exposed = cors_kwargs.get("expose_headers", []) - assert "X-Correlation-ID" in exposed def _get_cors_kwargs(self, app: FastAPI) -> dict[str, Any] | None: """Get CORS middleware kwargs from app.""" diff --git a/backend/tests/e2e/core/test_middlewares.py b/backend/tests/e2e/core/test_middlewares.py index d0edc147..2564fe86 100644 --- a/backend/tests/e2e/core/test_middlewares.py +++ b/backend/tests/e2e/core/test_middlewares.py @@ -5,35 +5,6 @@ pytestmark = pytest.mark.e2e -class TestCorrelationMiddleware: - """Tests for CorrelationMiddleware.""" - - @pytest.mark.asyncio - async def test_generates_correlation_id(self, client: httpx.AsyncClient) -> None: - """Middleware generates correlation ID when not provided.""" - response = await client.get("/api/v1/health/live") - - assert response.status_code == 200 - assert "X-Correlation-ID" in response.headers - correlation_id = response.headers["X-Correlation-ID"] - assert correlation_id.startswith("req-") - - @pytest.mark.asyncio - async def test_passes_through_correlation_id( - self, client: httpx.AsyncClient - ) -> None: - """Middleware uses provided correlation ID.""" - custom_id = "custom-correlation-12345" - - response = await client.get( - "/api/v1/health/live", - headers={"X-Correlation-ID": custom_id}, - ) - - assert response.status_code == 200 - assert response.headers["X-Correlation-ID"] == custom_id - - class TestCSRFMiddleware: """Tests for CSRFMiddleware.""" @@ -241,16 +212,6 @@ async def test_auth_endpoints_exempt( class TestMiddlewareOrder: """Tests for middleware execution order.""" - @pytest.mark.asyncio - async def test_correlation_id_before_other_processing( - self, client: httpx.AsyncClient - ) -> None: - """Correlation ID is set before other middleware runs.""" - # Even on error responses, correlation ID should be present - response = 
await client.get("/nonexistent-path") - - assert "X-Correlation-ID" in response.headers - @pytest.mark.asyncio async def test_all_middlewares_work_together( self, test_user: httpx.AsyncClient, test_settings: Settings @@ -258,9 +219,6 @@ async def test_all_middlewares_work_together( """All middlewares work correctly in combination.""" response = await test_user.get("/api/v1/notifications") - # Correlation middleware ran - assert "X-Correlation-ID" in response.headers - # Cache control middleware ran assert "Cache-Control" in response.headers diff --git a/backend/tests/e2e/test_admin_events_routes.py b/backend/tests/e2e/test_admin_events_routes.py index da824f2c..0bad3ed7 100644 --- a/backend/tests/e2e/test_admin_events_routes.py +++ b/backend/tests/e2e/test_admin_events_routes.py @@ -332,7 +332,7 @@ async def test_replay_events_dry_run( result = EventReplayResponse.model_validate(response.json()) assert result.dry_run is True assert result.total_events >= 1 - assert result.replay_correlation_id is not None + assert result.replay_id is not None assert result.status == ReplayStatus.PREVIEW @pytest.mark.asyncio @@ -341,7 +341,7 @@ async def test_replay_events_no_events_found( ) -> None: """Replay with non-matching filter returns 404.""" request = EventReplayRequest( - correlation_id="nonexistent-correlation-id-12345", + aggregate_id="nonexistent-aggregate-id-12345", dry_run=True, ) response = await test_admin.post( diff --git a/backend/tests/unit/core/test_logging_and_correlation.py b/backend/tests/unit/core/test_logging_and_correlation.py index 934e4482..0f09fd5d 100644 --- a/backend/tests/unit/core/test_logging_and_correlation.py +++ b/backend/tests/unit/core/test_logging_and_correlation.py @@ -3,18 +3,12 @@ import pytest import structlog -from app.core.correlation import CorrelationMiddleware from app.core.logging import ( SENSITIVE_PATTERNS, add_otel_context, sanitize_sensitive_data, setup_logger, ) -from starlette.applications import Starlette -from starlette.requests import Request -from starlette.responses import JSONResponse -from starlette.routing import Route -from starlette.testclient import TestClient class TestSanitizeSensitiveData: @@ -110,50 +104,6 @@ def test_with_valid_span(self) -> None: assert result["span_id"] == "1234567890abcdef" -class TestCorrelationMiddleware: - """Tests for CorrelationMiddleware.""" - - def test_sets_correlation_header(self) -> None: - async def ping(request: Request) -> JSONResponse: - return JSONResponse({"ok": True}) - - app = Starlette(routes=[Route("/ping", ping)]) - app.add_middleware(CorrelationMiddleware) - - with TestClient(app) as client: - response = client.get("/ping") - - assert response.status_code == 200 - assert "X-Correlation-ID" in response.headers - - def test_preserves_provided_correlation_id(self) -> None: - async def ping(request: Request) -> JSONResponse: - return JSONResponse({"ok": True}) - - app = Starlette(routes=[Route("/ping", ping)]) - app.add_middleware(CorrelationMiddleware) - - with TestClient(app) as client: - response = client.get("/ping", headers={"X-Correlation-ID": "custom-id-123"}) - assert response.headers["X-Correlation-ID"] == "custom-id-123" - - def test_stores_correlation_in_scope_state(self) -> None: - captured_state: dict[str, Any] = {} - - async def capture(request: Request) -> JSONResponse: - captured_state["correlation_id"] = request.state.correlation_id - return JSONResponse({"ok": True}) - - app = Starlette(routes=[Route("/capture", capture)]) - app.add_middleware(CorrelationMiddleware) - - with 
TestClient(app) as client: - client.get("/capture") - - assert "correlation_id" in captured_state - assert captured_state["correlation_id"].startswith("req-") - - class TestSetupLogger: """Tests for logger setup.""" diff --git a/backend/tests/unit/events/test_metadata_model.py b/backend/tests/unit/events/test_metadata_model.py index aae3644f..261a1092 100644 --- a/backend/tests/unit/events/test_metadata_model.py +++ b/backend/tests/unit/events/test_metadata_model.py @@ -5,16 +5,8 @@ def test_metadata_creation() -> None: m = EventMetadata(service_name="svc", service_version="1") assert m.service_name == "svc" assert m.service_version == "1" - assert m.correlation_id == "" # empty until filled by publish_event def test_metadata_with_user() -> None: m = EventMetadata(service_name="svc", service_version="1", user_id="u1") assert m.user_id == "u1" - - -def test_metadata_copy_with_correlation() -> None: - m = EventMetadata(service_name="svc", service_version="1") - m2 = m.model_copy(update={"correlation_id": "cid"}) - assert m2.correlation_id == "cid" - assert m2.service_name == m.service_name diff --git a/backend/tests/unit/services/pod_monitor/test_event_mapper.py b/backend/tests/unit/services/pod_monitor/test_event_mapper.py index 8f956d59..7d5d586b 100644 --- a/backend/tests/unit/services/pod_monitor/test_event_mapper.py +++ b/backend/tests/unit/services/pod_monitor/test_event_mapper.py @@ -159,20 +159,18 @@ def test_extract_id_and_metadata_priority_and_duplicates() -> None: p = make_pod( name="any", phase="Pending", - labels={"execution-id": "L1", "user-id": "u", "correlation-id": "corrL"}, + labels={"execution-id": "L1", "user-id": "u"}, ) md = pem._create_metadata(p) - assert pem._extract_execution_id(p) == "L1" and md.user_id == "u" and md.correlation_id == "corrL" + assert pem._extract_execution_id(p) == "L1" and md.user_id == "u" # From annotation when label absent p2 = make_pod( name="any", phase="Pending", - annotations={"integr8s.io/execution-id": "A1", "integr8s.io/correlation-id": "corrA"}, + annotations={"integr8s.io/execution-id": "A1"}, ) assert pem._extract_execution_id(p2) == "A1" - md2 = pem._create_metadata(p2) - assert md2.correlation_id == "corrA" # From name pattern exec- p3 = make_pod(name="exec-XYZ", phase="Pending") diff --git a/backend/tests/unit/services/pod_monitor/test_monitor.py b/backend/tests/unit/services/pod_monitor/test_monitor.py index 28d9ecdc..06d950a3 100644 --- a/backend/tests/unit/services/pod_monitor/test_monitor.py +++ b/backend/tests/unit/services/pod_monitor/test_monitor.py @@ -278,7 +278,6 @@ class MockMapper: async def map_pod_event(self, pod: Any, event_type: WatchEventType) -> list[Any]: # noqa: ARG002 class Event: event_type = types.SimpleNamespace(value="test_event") - metadata = types.SimpleNamespace(correlation_id=None) aggregate_id = "agg1" return [Event()] diff --git a/backend/tests/unit/services/test_pod_builder.py b/backend/tests/unit/services/test_pod_builder.py index 528f1b17..6fdd96d4 100644 --- a/backend/tests/unit/services/test_pod_builder.py +++ b/backend/tests/unit/services/test_pod_builder.py @@ -47,7 +47,6 @@ def create_pod_command(self) -> CreatePodCommandEvent: priority=QueuePriority.NORMAL, metadata=EventMetadata( user_id=str(uuid4()), - correlation_id=str(uuid4()), service_name="test-service", service_version="1.0.0" ) @@ -164,7 +163,6 @@ def test_container_resources_defaults( service_name="svc", service_version="1", user_id=str(uuid4()), - correlation_id=str(uuid4()) ) ) @@ -354,7 +352,6 @@ def test_pod_labels_truncation( 
service_name="svc", service_version="1", user_id=long_id, - correlation_id=long_id ) ) @@ -362,11 +359,9 @@ def test_pod_labels_truncation( # Verify labels are truncated to 63 chars assert len(pod.metadata.labels["user-id"]) == 63 - assert len(pod.metadata.labels["correlation-id"]) == 63 assert len(pod.metadata.labels["saga-id"]) == 63 # But annotations should have full values - assert pod.metadata.annotations["integr8s.io/correlation-id"] == long_id assert pod.metadata.annotations["integr8s.io/saga-id"] == long_id def test_pod_restart_policy( diff --git a/docs/architecture/event-storage.md b/docs/architecture/event-storage.md index ae56c78f..cc198fe4 100644 --- a/docs/architecture/event-storage.md +++ b/docs/architecture/event-storage.md @@ -36,7 +36,7 @@ For MongoDB queries, access payload fields with dot notation: ```python query["payload.execution_id"] = execution_id -query["metadata.correlation_id"] = correlation_id +query["aggregate_id"] = aggregate_id ``` ## Write flow diff --git a/docs/operations/logging.md b/docs/operations/logging.md index 64470f3a..8d242752 100644 --- a/docs/operations/logging.md +++ b/docs/operations/logging.md @@ -1,6 +1,6 @@ # Logging -This backend uses structured JSON logging with automatic correlation IDs, trace context injection, and sensitive data +This backend uses structured JSON logging with OpenTelemetry trace context injection and sensitive data sanitization. The goal is logs that are both secure against injection attacks and easy to query in aggregation systems like Elasticsearch or Loki. @@ -9,8 +9,7 @@ like Elasticsearch or Loki. ```mermaid flowchart LR Code[Application Code] --> Logger - Logger --> CF[CorrelationFilter] - CF --> TF[TracingFilter] + Logger --> TF[TracingFilter] TF --> JF[JSONFormatter] JF --> Output[JSON stdout] ``` @@ -19,16 +18,15 @@ flowchart LR The logger is created once during application startup via dependency injection. The [`setup_logger`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/logging.py) function configures a -JSON formatter and attaches filters for correlation IDs and OpenTelemetry trace context: +JSON formatter and attaches a filter for OpenTelemetry trace context: ```python --8<-- "backend/app/core/logging.py:110:147" ``` The JSON formatter does two things beyond basic formatting. First, it injects context that would be tedious to pass -manually—the correlation ID from the current request, the trace and span IDs from OpenTelemetry, and request metadata -like method and path. Second, it sanitizes sensitive data by pattern-matching things like API keys, JWT tokens, and -database URLs: +manually—the trace and span IDs from OpenTelemetry, and request metadata like method and path. Second, it sanitizes +sensitive data by pattern-matching things like API keys, JWT tokens, and database URLs: ```python --8<-- "backend/app/core/logging.py:35:59" @@ -74,11 +72,10 @@ messages (which often contain user data). ## What gets logged -Correlation and trace IDs are injected automatically by filters: +Trace IDs are injected automatically by the OTel filter: | Field | Source | Purpose | |------------------|---------------------------------|-----------------------------------| -| `correlation_id` | Request header or generated | Track request across services | | `trace_id` | OpenTelemetry | Link to distributed traces | | `span_id` | OpenTelemetry | Link to specific span | | `request_method` | HTTP request | GET, POST, etc. 
| @@ -90,8 +87,8 @@ consistent: the message says what happened, `extra` says to what and by whom. ## Practical use -When something goes wrong, start by filtering logs by `correlation_id` to see everything that happened during that -request. If you need to correlate with traces, use the `trace_id` to jump to Jaeger. +When something goes wrong, start by filtering logs by `trace_id` to see everything that happened during that +request. Use the `trace_id` to jump to Jaeger for the full distributed trace. | Log Level | Use case | |-----------|-------------------------------------------------------------| @@ -107,4 +104,3 @@ The log level is controlled by the `LOG_LEVEL` environment variable. | File | Purpose | |------------------------------------------------------------------------------------------------------------|--------------------------------------| | [`core/logging.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/logging.py) | Logger setup, filters, JSON formatter| -| [`core/correlation.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/correlation.py) | Correlation ID middleware | diff --git a/docs/operations/tracing.md b/docs/operations/tracing.md index a8050d6f..c8e51ccd 100644 --- a/docs/operations/tracing.md +++ b/docs/operations/tracing.md @@ -16,7 +16,7 @@ FastAPI auto-instrumentation creates a span per request. In selected endpoints a ### Kafka publishes -The event publisher (KafkaEventService) adds headers to each message. Besides a few readable headers (event_type, correlation_id, service) it injects the W3C trace context (traceparent/tracestate). That makes the trace transferable to consumers without coupling to any specific SDK. +The event publisher (KafkaEventService) adds headers to each message. Besides a few readable headers (event_type, service) it injects the W3C trace context (traceparent/tracestate). That makes the trace transferable to consumers without coupling to any specific SDK. 
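Below is a minimal, illustrative sketch of what that header injection typically looks like with the OpenTelemetry Python SDK. It is not the actual KafkaEventService code from this patch; the helper name, header layout, and byte encoding are assumptions for illustration only.

```python
# Illustrative sketch only — not the project's KafkaEventService implementation.
# opentelemetry.propagate.inject() uses the globally configured propagator,
# which by default emits W3C "traceparent" (and "tracestate" when present).
from opentelemetry.propagate import inject


def build_kafka_headers(event_type: str, service: str) -> list[tuple[str, bytes]]:
    carrier: dict[str, str] = {}
    inject(carrier)  # fills the dict with the current trace context
    headers = {"event_type": event_type, "service": service, **carrier}
    # Most Python Kafka clients expect headers as (str, bytes) tuples.
    return [(key, value.encode("utf-8")) for key, value in headers.items()]
```

On the consumer side the same headers can be rebuilt into a dict and handed to `opentelemetry.propagate.extract`, which is what keeps the trace continuous across the broker, as the next section describes.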
### Kafka consumes diff --git a/docs/reference/openapi.json b/docs/reference/openapi.json index bcceab41..24b8e57f 100644 --- a/docs/reference/openapi.json +++ b/docs/reference/openapi.json @@ -2133,24 +2133,6 @@ }, "description": "Aggregate ID filter" }, - { - "name": "correlation_id", - "in": "query", - "required": false, - "schema": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "description": "Correlation ID filter", - "title": "Correlation Id" - }, - "description": "Correlation ID filter" - }, { "name": "user_id", "in": "query", @@ -6856,17 +6838,6 @@ ], "title": "Aggregate Id" }, - "correlation_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Correlation Id" - }, "user_id": { "anyOf": [ { @@ -6939,10 +6910,6 @@ "type": "string", "title": "Service Version" }, - "correlation_id": { - "type": "string", - "title": "Correlation Id" - }, "user_id": { "type": "string", "title": "User Id" @@ -6976,17 +6943,6 @@ ], "title": "Event Ids" }, - "correlation_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Correlation Id" - }, "aggregate_id": { "anyOf": [ { @@ -7053,9 +7009,9 @@ "type": "integer", "title": "Total Events" }, - "replay_correlation_id": { + "replay_id": { "type": "string", - "title": "Replay Correlation Id" + "title": "Replay Id" }, "session_id": { "anyOf": [ @@ -7090,7 +7046,7 @@ "required": [ "dry_run", "total_events", - "replay_correlation_id", + "replay_id", "status" ], "title": "EventReplayResponse", @@ -7121,9 +7077,9 @@ "type": "integer", "title": "Skipped Events" }, - "correlation_id": { + "replay_id": { "type": "string", - "title": "Correlation Id" + "title": "Replay Id" }, "created_at": { "type": "string", @@ -7208,7 +7164,7 @@ "replayed_events", "failed_events", "skipped_events", - "correlation_id", + "replay_id", "created_at", "progress_percentage" ], @@ -10575,17 +10531,6 @@ ], "title": "Execution Id" }, - "correlation_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Correlation Id" - }, "aggregate_id": { "anyOf": [ { @@ -12933,17 +12878,6 @@ } ], "title": "Reason" - }, - "correlation_id": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Correlation Id" } }, "type": "object", diff --git a/frontend/src/components/admin/events/EventDetailsModal.svelte b/frontend/src/components/admin/events/EventDetailsModal.svelte index 91d39c2c..1ac748e4 100644 --- a/frontend/src/components/admin/events/EventDetailsModal.svelte +++ b/frontend/src/components/admin/events/EventDetailsModal.svelte @@ -48,10 +48,6 @@ Timestamp {formatTimestamp(eventData.timestamp)} - - Correlation ID - {eventData.metadata.correlation_id ?? '-'} - Aggregate ID {eventData.aggregate_id || '-'} diff --git a/frontend/src/components/admin/events/EventFilters.svelte b/frontend/src/components/admin/events/EventFilters.svelte index fa5a4e88..34f7f99e 100644 --- a/frontend/src/components/admin/events/EventFilters.svelte +++ b/frontend/src/components/admin/events/EventFilters.svelte @@ -11,7 +11,6 @@ filters = $bindable({ event_types: [], aggregate_id: '', - correlation_id: '', user_id: '', service_name: '', search_text: '', @@ -72,19 +71,6 @@ /> -
- - -
-
- Correlation: - - {event.metadata?.correlation_id ?? '-'} + Aggregate: + + {event.aggregate_id ?? '-'}
diff --git a/frontend/src/components/admin/events/__tests__/EventDetailsModal.test.ts b/frontend/src/components/admin/events/__tests__/EventDetailsModal.test.ts index 922dbab2..8e870ae1 100644 --- a/frontend/src/components/admin/events/__tests__/EventDetailsModal.test.ts +++ b/frontend/src/components/admin/events/__tests__/EventDetailsModal.test.ts @@ -49,7 +49,6 @@ describe('EventDetailsModal', () => { it.each([ { label: 'Event ID', value: 'evt-1' }, { label: 'Event Type', value: 'execution_completed' }, - { label: 'Correlation ID', value: 'corr-123' }, { label: 'Aggregate ID', value: 'exec-456' }, ])('displays $label with value "$value"', ({ label, value }) => { renderModal(); @@ -101,14 +100,4 @@ describe('EventDetailsModal', () => { expect(onClose).toHaveBeenCalledOnce(); }); - it.each([ - { case: 'null', value: null }, - { case: 'undefined', value: undefined }, - ])('shows "-" when correlation_id is $case', ({ value }) => { - const detail = createMockEventDetail(); - detail.event.metadata.correlation_id = value as undefined; - renderModal({ event: detail }); - const cells = screen.getAllByText('-'); - expect(cells.length).toBeGreaterThanOrEqual(1); - }); }); diff --git a/frontend/src/components/admin/events/__tests__/EventFilters.test.ts b/frontend/src/components/admin/events/__tests__/EventFilters.test.ts index 012e4f8a..613bbf27 100644 --- a/frontend/src/components/admin/events/__tests__/EventFilters.test.ts +++ b/frontend/src/components/admin/events/__tests__/EventFilters.test.ts @@ -28,7 +28,6 @@ describe('EventFilters', () => { it.each([ { id: 'event-types-filter', label: 'Event Types' }, { id: 'search-filter', label: 'Search' }, - { id: 'correlation-filter', label: 'Correlation ID' }, { id: 'aggregate-filter', label: 'Aggregate ID' }, { id: 'user-filter', label: 'User ID' }, { id: 'service-filter', label: 'Service' }, diff --git a/frontend/src/lib/admin/events/__tests__/eventTypes.test.ts b/frontend/src/lib/admin/events/__tests__/eventTypes.test.ts index 7b56c740..ba9bbc5c 100644 --- a/frontend/src/lib/admin/events/__tests__/eventTypes.test.ts +++ b/frontend/src/lib/admin/events/__tests__/eventTypes.test.ts @@ -62,7 +62,6 @@ describe('eventTypes', () => { expect(createDefaultEventFilters()).toEqual({ event_types: [], aggregate_id: '', - correlation_id: '', user_id: '', service_name: '', search_text: '', @@ -84,7 +83,6 @@ describe('eventTypes', () => { it.each([ ['event_types', { event_types: ['execution_completed'] }], ['search_text', { search_text: 'test' }], - ['correlation_id', { correlation_id: 'abc' }], ['aggregate_id', { aggregate_id: 'exec-1' }], ['user_id', { user_id: 'user-1' }], ['service_name', { service_name: 'svc' }], @@ -98,12 +96,12 @@ describe('eventTypes', () => { describe('getActiveFilterCount', () => { it.each([ [createDefaultEventFilters(), 0], - [withFilter({ event_types: ['x'], search_text: 'y', correlation_id: 'z' }), 3], + [withFilter({ event_types: ['x'], search_text: 'y' }), 2], [withFilter({ - event_types: ['x'], search_text: 'y', correlation_id: 'z', + event_types: ['x'], search_text: 'y', aggregate_id: 'a', user_id: 'u', service_name: 's', start_time: 't1', end_time: 't2' - }), 8], + }), 7], ])('returns correct count', (filters, expected) => { expect(getActiveFilterCount(filters)).toBe(expected); }); @@ -118,7 +116,6 @@ describe('eventTypes', () => { [{ event_types: ['a', 'b'] }, '2 event types'], [{ event_types: ['a'] }, '1 event type'], [{ search_text: 'test' }, 'search'], - [{ correlation_id: 'abc' }, 'correlation'], [{ start_time: '2024-01-01' }, 
'time range'], [{ end_time: '2024-01-02' }, 'time range'], ])('includes expected label', (override, expected) => { @@ -127,10 +124,10 @@ describe('eventTypes', () => { it('includes all active filter labels', () => { const summary = getActiveFilterSummary(withFilter({ - event_types: ['x'], search_text: 'y', correlation_id: 'z', + event_types: ['x'], search_text: 'y', aggregate_id: 'a', user_id: 'u', service_name: 's', start_time: 't' })); - ['1 event type', 'search', 'correlation', 'aggregate', 'user', 'service', 'time range'] + ['1 event type', 'search', 'aggregate', 'user', 'service', 'time range'] .forEach(label => expect(summary).toContain(label)); }); }); diff --git a/frontend/src/lib/admin/events/eventTypes.ts b/frontend/src/lib/admin/events/eventTypes.ts index c07e58bc..231542ef 100644 --- a/frontend/src/lib/admin/events/eventTypes.ts +++ b/frontend/src/lib/admin/events/eventTypes.ts @@ -58,7 +58,6 @@ export function getEventTypeLabel(eventType: EventType): string { export interface EventFilters { event_types: EventType[]; aggregate_id: string; - correlation_id: string; user_id: string; service_name: string; search_text: string; @@ -70,7 +69,6 @@ export function createDefaultEventFilters(): EventFilters { return { event_types: [], aggregate_id: '', - correlation_id: '', user_id: '', service_name: '', search_text: '', @@ -83,7 +81,6 @@ export function hasActiveFilters(filters: EventFilters): boolean { return ( filters.event_types.length > 0 || !!filters.search_text || - !!filters.correlation_id || !!filters.aggregate_id || !!filters.user_id || !!filters.service_name || @@ -96,7 +93,6 @@ export function getActiveFilterCount(filters: EventFilters): number { let count = 0; if (filters.event_types.length > 0) count++; if (filters.search_text) count++; - if (filters.correlation_id) count++; if (filters.aggregate_id) count++; if (filters.user_id) count++; if (filters.service_name) count++; @@ -111,7 +107,6 @@ export function getActiveFilterSummary(filters: EventFilters): string[] { items.push(`${filters.event_types.length} event type${filters.event_types.length > 1 ? 
's' : ''}`); } if (filters.search_text) items.push('search'); - if (filters.correlation_id) items.push('correlation'); if (filters.aggregate_id) items.push('aggregate'); if (filters.user_id) items.push('user'); if (filters.service_name) items.push('service'); diff --git a/frontend/src/lib/api/types.gen.ts b/frontend/src/lib/api/types.gen.ts index c143a77e..7971ea31 100644 --- a/frontend/src/lib/api/types.gen.ts +++ b/frontend/src/lib/api/types.gen.ts @@ -1535,10 +1535,6 @@ export type EventFilter = { * Aggregate Id */ aggregate_id?: string | null; - /** - * Correlation Id - */ - correlation_id?: string | null; /** * User Id */ @@ -1575,10 +1571,6 @@ export type EventMetadata = { * Service Version */ service_version: string; - /** - * Correlation Id - */ - correlation_id?: string; /** * User Id */ @@ -1596,10 +1588,6 @@ export type EventReplayRequest = { * Event Ids */ event_ids?: Array | null; - /** - * Correlation Id - */ - correlation_id?: string | null; /** * Aggregate Id */ @@ -1637,9 +1625,9 @@ export type EventReplayResponse = { */ total_events: number; /** - * Replay Correlation Id + * Replay Id */ - replay_correlation_id: string; + replay_id: string; /** * Session Id */ @@ -1679,9 +1667,9 @@ export type EventReplayStatusResponse = { */ skipped_events: number; /** - * Correlation Id + * Replay Id */ - correlation_id: string; + replay_id: string; /** * Created At */ @@ -3688,10 +3676,6 @@ export type ReplayFilter = { * Execution Id */ execution_id?: string | null; - /** - * Correlation Id - */ - correlation_id?: string | null; /** * Aggregate Id */ @@ -5072,10 +5056,6 @@ export type SettingsHistoryEntry = { * Reason */ reason?: string | null; - /** - * Correlation Id - */ - correlation_id?: string | null; }; /** @@ -5898,9 +5878,9 @@ export type EventReplayStatusResponseWritable = { */ skipped_events: number; /** - * Correlation Id + * Replay Id */ - correlation_id: string; + replay_id: string; /** * Created At */ @@ -7213,12 +7193,6 @@ export type ExportEventsApiV1AdminEventsExportExportFormatGetData = { * Aggregate ID filter */ aggregate_id?: string | null; - /** - * Correlation Id - * - * Correlation ID filter - */ - correlation_id?: string | null; /** * User Id * diff --git a/frontend/src/routes/admin/AdminEvents.svelte b/frontend/src/routes/admin/AdminEvents.svelte index 8da3af69..3cca9c0b 100644 --- a/frontend/src/routes/admin/AdminEvents.svelte +++ b/frontend/src/routes/admin/AdminEvents.svelte @@ -166,7 +166,7 @@ progress_percentage: 0, failed_events: 0, skipped_events: 0, - correlation_id: '', + replay_id: '', created_at: new Date().toISOString() }; checkReplayStatus(sessionId); @@ -190,7 +190,6 @@ if (filters.start_time) params.append('start_time', new Date(filters.start_time).toISOString()); if (filters.end_time) params.append('end_time', new Date(filters.end_time).toISOString()); if (filters.aggregate_id) params.append('aggregate_id', filters.aggregate_id); - if (filters.correlation_id) params.append('correlation_id', filters.correlation_id); if (filters.user_id) params.append('user_id', filters.user_id); if (filters.service_name) params.append('service_name', filters.service_name); diff --git a/frontend/src/routes/admin/__tests__/AdminEvents.test.ts b/frontend/src/routes/admin/__tests__/AdminEvents.test.ts index 67d9bc9e..f93579aa 100644 --- a/frontend/src/routes/admin/__tests__/AdminEvents.test.ts +++ b/frontend/src/routes/admin/__tests__/AdminEvents.test.ts @@ -229,7 +229,6 @@ describe('AdminEvents', () => { await waitFor(() => { 
expect(screen.getByLabelText(/Search/i)).toBeInTheDocument(); - expect(screen.getByLabelText(/Correlation ID/i)).toBeInTheDocument(); expect(screen.getByLabelText(/Aggregate ID/i)).toBeInTheDocument(); expect(screen.getByLabelText(/User ID/i)).toBeInTheDocument(); expect(screen.getByLabelText(/Service/i)).toBeInTheDocument(); @@ -396,7 +395,7 @@ describe('AdminEvents', () => { it('displays event information in modal', async () => { vi.useRealTimers(); const user = userEvent.setup(); - const event = createMockEvent({ event_id: 'evt-123', event_type: 'execution_completed', metadata: { correlation_id: 'corr-abc' } }); + const event = createMockEvent({ event_id: 'evt-123', event_type: 'execution_completed' }); mocks.getEventDetailApiV1AdminEventsEventIdGet.mockResolvedValue({ data: createMockEventDetail(event), error: null, @@ -410,7 +409,6 @@ describe('AdminEvents', () => { // Using getAllByText because values may appear in table + modal expect(screen.getAllByText('evt-123').length).toBeGreaterThan(0); expect(screen.getAllByText('execution_completed').length).toBeGreaterThan(0); - expect(screen.getAllByText('corr-abc').length).toBeGreaterThan(0); }); }); diff --git a/frontend/src/routes/admin/__tests__/test-utils.ts b/frontend/src/routes/admin/__tests__/test-utils.ts index f31db55d..183cb4bb 100644 --- a/frontend/src/routes/admin/__tests__/test-utils.ts +++ b/frontend/src/routes/admin/__tests__/test-utils.ts @@ -51,7 +51,6 @@ export function mockWindowGlobals(openMock: Mock, confirmMock: Mock): void { export interface MockEventMetadata { service_name: string; service_version: string; - correlation_id?: string; user_id?: string | null; } @@ -75,7 +74,6 @@ export const DEFAULT_EVENT = { metadata: { service_name: 'test-service', service_version: '1.0.0', - correlation_id: 'corr-123', user_id: 'user-1', }, execution_id: 'exec-456', @@ -102,7 +100,6 @@ export const createMockEvents = (count: number) => timestamp: new Date(Date.now() - i * 60000).toISOString(), aggregate_id: `exec-${i + 1}`, metadata: { - correlation_id: `corr-${i + 1}`, user_id: `user-${(i % 3) + 1}`, service_name: 'execution-service', }, From 297fa0dfaaaa34e32f40d9693d42a44ad301aedd Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 13:58:27 +0100 Subject: [PATCH 06/12] fix: found issues --- backend/app/api/routes/auth.py | 84 +++++++------------ backend/app/core/dishka_lifespan.py | 10 +-- backend/app/core/logging.py | 36 ++++---- backend/app/core/middlewares/metrics.py | 5 +- backend/app/core/tracing/__init__.py | 2 +- .../db/repositories/execution_repository.py | 12 +-- backend/app/dlq/manager.py | 14 ++-- backend/app/main.py | 14 ++-- .../services/admin/admin_events_service.py | 42 ++++------ .../services/admin/admin_settings_service.py | 6 +- .../app/services/admin/admin_user_service.py | 40 ++++----- .../services/event_replay/replay_service.py | 19 +++-- backend/app/services/execution_service.py | 71 +++++++--------- backend/app/services/kafka_event_service.py | 2 +- backend/app/services/login_lockout.py | 3 +- backend/app/services/notification_service.py | 75 +++++++---------- .../app/services/saga/saga_orchestrator.py | 18 ++-- backend/app/services/saga/saga_service.py | 58 ++++++------- backend/app/services/saved_script_service.py | 60 ++++++------- backend/app/services/sse/redis_bus.py | 3 +- backend/app/services/sse/sse_service.py | 16 ++-- backend/app/services/user_settings_service.py | 9 +- backend/app/settings.py | 2 +- backend/pyproject.toml | 2 +- .../unit/core/test_logging_and_correlation.py | 28 
+++++++ backend/uv.lock | 2 +- docs/operations/logging.md | 23 +++-- frontend/src/routes/admin/AdminEvents.svelte | 2 +- 28 files changed, 314 insertions(+), 344 deletions(-) diff --git a/backend/app/api/routes/auth.py b/backend/app/api/routes/auth.py index 3e1aff7f..8dca8c15 100644 --- a/backend/app/api/routes/auth.py +++ b/backend/app/api/routes/auth.py @@ -46,12 +46,10 @@ async def login( """Authenticate and receive session cookies.""" logger.info( "Login attempt", - extra={ - "username": form_data.username, - "client_ip": get_client_ip(request), - "endpoint": "/login", - "user_agent": request.headers.get("user-agent"), - }, + username=form_data.username, + client_ip=get_client_ip(request), + endpoint="/login", + user_agent=request.headers.get("user-agent"), ) if await lockout_service.check_locked(form_data.username): @@ -65,11 +63,9 @@ async def login( if not user: logger.warning( "Login failed - user not found", - extra={ - "username": form_data.username, - "client_ip": get_client_ip(request), - "user_agent": request.headers.get("user-agent"), - }, + username=form_data.username, + client_ip=get_client_ip(request), + user_agent=request.headers.get("user-agent"), ) locked = await lockout_service.record_failed_attempt(form_data.username) if locked: @@ -86,11 +82,9 @@ async def login( if not security_service.verify_password(form_data.password, user.hashed_password): logger.warning( "Login failed - invalid password", - extra={ - "username": form_data.username, - "client_ip": get_client_ip(request), - "user_agent": request.headers.get("user-agent"), - }, + username=form_data.username, + client_ip=get_client_ip(request), + user_agent=request.headers.get("user-agent"), ) locked = await lockout_service.record_failed_attempt(form_data.username) if locked: @@ -111,12 +105,10 @@ async def login( logger.info( "Login successful", - extra={ - "username": user.username, - "client_ip": get_client_ip(request), - "user_agent": request.headers.get("user-agent"), - "token_expires_in_minutes": session_timeout, - }, + username=user.username, + client_ip=get_client_ip(request), + user_agent=request.headers.get("user-agent"), + token_expires_in_minutes=session_timeout, ) access_token_expires = timedelta(minutes=session_timeout) @@ -174,12 +166,10 @@ async def register( """Register a new user account.""" logger.info( "Registration attempt", - extra={ - "username": user.username, - "client_ip": get_client_ip(request), - "endpoint": "/register", - "user_agent": request.headers.get("user-agent"), - }, + username=user.username, + client_ip=get_client_ip(request), + endpoint="/register", + user_agent=request.headers.get("user-agent"), ) effective = await runtime_settings.get_effective_settings() @@ -191,11 +181,9 @@ async def register( if db_user: logger.warning( "Registration failed - username taken", - extra={ - "username": user.username, - "client_ip": get_client_ip(request), - "user_agent": request.headers.get("user-agent"), - }, + username=user.username, + client_ip=get_client_ip(request), + user_agent=request.headers.get("user-agent"), ) raise HTTPException(status_code=409, detail="Username already registered") @@ -212,11 +200,9 @@ async def register( logger.info( "Registration successful", - extra={ - "username": created_user.username, - "client_ip": get_client_ip(request), - "user_agent": request.headers.get("user-agent"), - }, + username=created_user.username, + client_ip=get_client_ip(request), + user_agent=request.headers.get("user-agent"), ) return UserResponse.model_validate(created_user) @@ -234,11 
+220,9 @@ async def get_current_user_profile( logger.info( "User profile request", - extra={ - "username": current_user.username, - "client_ip": get_client_ip(request), - "endpoint": "/me", - }, + username=current_user.username, + client_ip=get_client_ip(request), + endpoint="/me", ) # Set cache control headers @@ -257,11 +241,9 @@ async def logout( """Log out and clear session cookies.""" logger.info( "Logout attempt", - extra={ - "client_ip": get_client_ip(request), - "endpoint": "/logout", - "user_agent": request.headers.get("user-agent"), - }, + client_ip=get_client_ip(request), + endpoint="/logout", + user_agent=request.headers.get("user-agent"), ) # Clear the httpOnly cookie @@ -278,10 +260,8 @@ async def logout( logger.info( "Logout successful", - extra={ - "client_ip": get_client_ip(request), - "user_agent": request.headers.get("user-agent"), - }, + client_ip=get_client_ip(request), + user_agent=request.headers.get("user-agent"), ) return MessageResponse(message="Logout successful") diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py index a7d23c4a..ce107cac 100644 --- a/backend/app/core/dishka_lifespan.py +++ b/backend/app/core/dishka_lifespan.py @@ -3,6 +3,7 @@ from contextlib import asynccontextmanager from typing import Any, AsyncGenerator +import structlog from beanie import init_beanie from dishka import AsyncContainer from dishka.integrations.faststream import setup_dishka as setup_dishka_faststream @@ -10,7 +11,6 @@ from faststream.kafka import KafkaBroker from pymongo import AsyncMongoClient -from app.core.logging import setup_logger from app.db.docs import ALL_DOCUMENTS from app.events.handlers import ( register_notification_subscriber, @@ -34,7 +34,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: """ settings: Settings = app.state.settings container: AsyncContainer = app.state.dishka_container - logger = setup_logger(settings.LOG_LEVEL) + logger: structlog.stdlib.BoundLogger = await container.get(structlog.stdlib.BoundLogger) # Initialize Beanie with tz_aware client (so MongoDB returns aware datetimes). # Use URL database first and fall back to configured DATABASE_NAME so runtime @@ -46,10 +46,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: logger.info( "Starting application with dishka DI", - extra={ - "project_name": settings.PROJECT_NAME, - "environment": "test" if settings.TESTING else "production", - }, + project_name=settings.PROJECT_NAME, + environment="test" if settings.TESTING else "production", ) # Get unstarted broker from DI (BrokerProvider yields without starting) diff --git a/backend/app/core/logging.py b/backend/app/core/logging.py index bb48287c..d5a8c398 100644 --- a/backend/app/core/logging.py +++ b/backend/app/core/logging.py @@ -17,17 +17,25 @@ ] +def _redact(value: str) -> str: + for pattern, replacement in SENSITIVE_PATTERNS: + value = re.sub(pattern, replacement, value, flags=re.IGNORECASE) + return value + + def sanitize_sensitive_data( logger: structlog.types.WrappedLogger, method_name: str, event_dict: structlog.types.EventDict, ) -> structlog.types.EventDict: - """Structlog processor that redacts sensitive data from the event message.""" - event = event_dict.get("event", "") - if isinstance(event, str): - for pattern, replacement in SENSITIVE_PATTERNS: - event = re.sub(pattern, replacement, event, flags=re.IGNORECASE) - event_dict["event"] = event + """Structlog processor that redacts sensitive data from all string fields. 
+ + Covers event message, formatted exception text, stack info, and any + string value added by prior processors. + """ + for key, value in event_dict.items(): + if isinstance(value, str): + event_dict[key] = _redact(value) return event_dict @@ -46,22 +54,11 @@ def add_otel_context( return event_dict -LOG_LEVELS: dict[str, int] = { - "DEBUG": logging.DEBUG, - "INFO": logging.INFO, - "WARNING": logging.WARNING, - "ERROR": logging.ERROR, - "CRITICAL": logging.CRITICAL, -} - - def setup_logger(log_level: str) -> structlog.stdlib.BoundLogger: """Configure structlog and return a bound logger for the application. Called by DI with Settings.LOG_LEVEL and also directly by main.py/lifespan. """ - level = LOG_LEVELS.get(log_level.upper(), logging.DEBUG) - structlog.configure( processors=[ structlog.contextvars.merge_contextvars, @@ -69,10 +66,10 @@ def setup_logger(log_level: str) -> structlog.stdlib.BoundLogger: structlog.stdlib.add_logger_name, structlog.stdlib.add_log_level, structlog.processors.TimeStamper(fmt="iso"), - sanitize_sensitive_data, add_otel_context, structlog.processors.StackInfoRenderer(), structlog.processors.format_exc_info, + sanitize_sensitive_data, structlog.processors.JSONRenderer(), ], wrapper_class=structlog.stdlib.BoundLogger, @@ -81,8 +78,7 @@ def setup_logger(log_level: str) -> structlog.stdlib.BoundLogger: cache_logger_on_first_use=True, ) - logging.basicConfig(level=level, format="%(message)s", handlers=[logging.StreamHandler()]) - logging.getLogger().setLevel(level) + logging.basicConfig(level=log_level.upper(), format="%(message)s", handlers=[logging.StreamHandler()]) logger: structlog.stdlib.BoundLogger = structlog.get_logger("integr8scode") return logger diff --git a/backend/app/core/middlewares/metrics.py b/backend/app/core/middlewares/metrics.py index 34bcdb8b..6868652e 100644 --- a/backend/app/core/middlewares/metrics.py +++ b/backend/app/core/middlewares/metrics.py @@ -141,7 +141,10 @@ def setup_metrics(settings: Settings, logger: structlog.stdlib.BoundLogger) -> N } ) - otlp_exporter = OTLPMetricExporter(endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT, insecure=True) + otlp_exporter = OTLPMetricExporter( + endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT, + insecure=settings.OTEL_EXPORTER_OTLP_ENDPOINT.startswith("http://"), + ) metric_reader = PeriodicExportingMetricReader( exporter=otlp_exporter, diff --git a/backend/app/core/tracing/__init__.py b/backend/app/core/tracing/__init__.py index 77236470..803a7e69 100644 --- a/backend/app/core/tracing/__init__.py +++ b/backend/app/core/tracing/__init__.py @@ -46,7 +46,7 @@ def __init__(self, settings: Settings, logger: structlog.stdlib.BoundLogger) -> provider.add_span_processor( BatchSpanProcessor(OTLPSpanExporter( endpoint=settings.OTLP_TRACES_ENDPOINT, - insecure=True, + insecure=settings.OTLP_TRACES_ENDPOINT.startswith("http://"), )) ) diff --git a/backend/app/db/repositories/execution_repository.py b/backend/app/db/repositories/execution_repository.py index de1aa1dc..f8004ded 100644 --- a/backend/app/db/repositories/execution_repository.py +++ b/backend/app/db/repositories/execution_repository.py @@ -19,19 +19,19 @@ def __init__(self, logger: structlog.stdlib.BoundLogger): async def create_execution(self, create_data: DomainExecutionCreate) -> DomainExecution: doc = ExecutionDocument(**create_data.model_dump()) - self.logger.info("Inserting execution into MongoDB", extra={"execution_id": doc.execution_id}) + self.logger.info("Inserting execution into MongoDB", execution_id=doc.execution_id) await doc.insert() - 
self.logger.info("Inserted execution", extra={"execution_id": doc.execution_id}) + self.logger.info("Inserted execution", execution_id=doc.execution_id) return DomainExecution.model_validate(doc) async def get_execution(self, execution_id: str) -> DomainExecution | None: - self.logger.info("Searching for execution in MongoDB", extra={"execution_id": execution_id}) + self.logger.info("Searching for execution in MongoDB", execution_id=execution_id) doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == execution_id) if not doc: - self.logger.warning("Execution not found in MongoDB", extra={"execution_id": execution_id}) + self.logger.warning("Execution not found in MongoDB", execution_id=execution_id) return None - self.logger.info("Found execution in MongoDB", extra={"execution_id": execution_id}) + self.logger.info("Found execution in MongoDB", execution_id=execution_id) return DomainExecution.model_validate(doc) async def update_execution(self, execution_id: str, update_data: DomainExecutionUpdate) -> bool: @@ -48,7 +48,7 @@ async def update_execution(self, execution_id: str, update_data: DomainExecution async def write_terminal_result(self, result: ExecutionResultDomain) -> bool: doc = await ExecutionDocument.find_one(ExecutionDocument.execution_id == result.execution_id) if not doc: - self.logger.warning("No execution found", extra={"execution_id": result.execution_id}) + self.logger.warning("No execution found", execution_id=result.execution_id) return False await doc.set( diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index d4ed3c95..05ef2284 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -80,7 +80,7 @@ async def handle_message(self, message: DLQMessage) -> None: """Process a single DLQ message: filter → store → decide retry/discard.""" for filter_func in self._filters: if not filter_func(message): - self.logger.info("Message filtered out", extra={"event_id": message.event.event_id}) + self.logger.info("Message filtered out", event_id=message.event.event_id) return message.status = DLQMessageStatus.PENDING @@ -158,7 +158,7 @@ async def retry_message(self, message: DLQMessage) -> None: ), topic=self._dlq_events_topic, ) - self.logger.info("Successfully retried message", extra={"event_id": message.event.event_id}) + self.logger.info("Successfully retried message", event_id=message.event.event_id) async def discard_message(self, message: DLQMessage, reason: str) -> None: """Discard a DLQ message, updating status and emitting an event.""" @@ -188,7 +188,7 @@ async def discard_message(self, message: DLQMessage, reason: str) -> None: ), topic=self._dlq_events_topic, ) - self.logger.warning("Discarded message", extra={"event_id": message.event.event_id, "reason": reason}) + self.logger.warning("Discarded message", event_id=message.event.event_id, reason=reason) async def process_due_retries(self) -> int: """Process all scheduled messages whose retry time has arrived. 
@@ -215,11 +215,11 @@ def add_filter(self, filter_func: Callable[[DLQMessage], bool]) -> None: async def retry_message_manually(self, event_id: str) -> bool: message = await self.repository.get_message_by_id(event_id) if not message: - self.logger.error("Message not found in DLQ", extra={"event_id": event_id}) + self.logger.error("Message not found in DLQ", event_id=event_id) return False if message.status in {DLQMessageStatus.DISCARDED, DLQMessageStatus.RETRIED}: - self.logger.info("Skipping manual retry", extra={"event_id": event_id, "status": message.status}) + self.logger.info("Skipping manual retry", event_id=event_id, status=message.status) return False await self.retry_message(message) @@ -266,11 +266,11 @@ async def discard_message_manually(self, event_id: str, reason: str) -> bool: """ message = await self.repository.get_message_by_id(event_id) if not message: - self.logger.error("Message not found in DLQ", extra={"event_id": event_id}) + self.logger.error("Message not found in DLQ", event_id=event_id) return False if message.status in {DLQMessageStatus.DISCARDED, DLQMessageStatus.RETRIED}: - self.logger.info("Skipping manual discard", extra={"event_id": event_id, "status": message.status}) + self.logger.info("Skipping manual discard", event_id=event_id, status=message.status) return False await self.discard_message(message, reason) diff --git a/backend/app/main.py b/backend/app/main.py index bd5ba081..2cdf36f5 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -131,14 +131,12 @@ def create_app(settings: Settings | None = None) -> FastAPI: logger = setup_logger(settings.LOG_LEVEL) logger.info( "Starting uvicorn server", - extra={ - "host": settings.SERVER_HOST, - "port": settings.SERVER_PORT, - "ssl_enabled": True, - "workers": settings.WEB_CONCURRENCY, - "backlog": settings.WEB_BACKLOG, - "timeout_keep_alive": settings.WEB_TIMEOUT, - }, + host=settings.SERVER_HOST, + port=settings.SERVER_PORT, + ssl_enabled=True, + workers=settings.WEB_CONCURRENCY, + backlog=settings.WEB_BACKLOG, + timeout_keep_alive=settings.WEB_TIMEOUT, ) uvicorn.run( "app.main:create_app", diff --git a/backend/app/services/admin/admin_events_service.py b/backend/app/services/admin/admin_events_service.py index 1fda61c7..c9d600ca 100644 --- a/backend/app/services/admin/admin_events_service.py +++ b/backend/app/services/admin/admin_events_service.py @@ -105,10 +105,8 @@ async def prepare_or_schedule_replay( self.logger.info( "Preparing replay session", - extra={ - "dry_run": dry_run, - "replay_id": replay_id, - }, + dry_run=dry_run, + replay_id=replay_id, ) event_count = await self._repo.count_events_for_replay(replay_filter) @@ -141,10 +139,8 @@ async def prepare_or_schedule_replay( ) self.logger.info( "Replay dry-run prepared", - extra={ - "total_events": result.total_events, - "replay_id": result.replay_id, - }, + total_events=result.total_events, + replay_id=result.replay_id, ) return result @@ -183,11 +179,9 @@ async def prepare_or_schedule_replay( ) self.logger.info( "Replay scheduled", - extra={ - "session_id": result.session_id, - "total_events": result.total_events, - "replay_id": result.replay_id, - }, + session_id=result.session_id, + total_events=result.total_events, + replay_id=result.replay_id, ) return result @@ -245,10 +239,8 @@ async def _export_csv(self, *, event_filter: EventFilter, limit: int) -> ExportR filename = f"events_export_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.csv" self.logger.info( "Exported events CSV", - extra={ - "row_count": len(rows), - "file_name": 
filename, - }, + row_count=len(rows), + file_name=filename, ) return ExportResult(file_name=filename, content=output.getvalue(), media_type="text/csv") @@ -270,16 +262,14 @@ async def _export_json(self, *, event_filter: EventFilter, limit: int) -> Export filename = f"events_export_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json" self.logger.info( "Exported events JSON", - extra={ - "event_count": len(events_data), - "file_name": filename, - }, + event_count=len(events_data), + file_name=filename, ) return ExportResult(file_name=filename, content=json_content, media_type="application/json") async def delete_event(self, *, event_id: str, deleted_by: str) -> bool: # Load event for archival; archive then delete - self.logger.warning("Admin attempting to delete event", extra={"event_id": event_id, "deleted_by": deleted_by}) + self.logger.warning("Admin attempting to delete event", event_id=event_id, deleted_by=deleted_by) detail = await self._repo.get_event_detail(event_id) if not detail: return False @@ -288,10 +278,8 @@ async def delete_event(self, *, event_id: str, deleted_by: str) -> bool: if deleted: self.logger.info( "Event deleted", - extra={ - "event_id": event_id, - "event_type": detail.event.event_type, - "deleted_by": deleted_by, - }, + event_id=event_id, + event_type=detail.event.event_type, + deleted_by=deleted_by, ) return deleted diff --git a/backend/app/services/admin/admin_settings_service.py b/backend/app/services/admin/admin_settings_service.py index 48ea0cae..d3dc69c3 100644 --- a/backend/app/services/admin/admin_settings_service.py +++ b/backend/app/services/admin/admin_settings_service.py @@ -19,14 +19,14 @@ def __init__( async def get_system_settings(self, user_id: str) -> SystemSettings: self.logger.info( "Admin retrieving system settings", - extra={"user_id": user_id}, + user_id=user_id, ) return await self._runtime_settings.get_effective_settings() async def update_system_settings(self, settings: SystemSettings, user_id: str) -> SystemSettings: self.logger.info( "Admin updating system settings", - extra={"user_id": user_id}, + user_id=user_id, ) updated = await self._repo.update_system_settings(settings=settings, user_id=user_id) self._runtime_settings.invalidate_cache() @@ -36,7 +36,7 @@ async def update_system_settings(self, settings: SystemSettings, user_id: str) - async def reset_system_settings(self, user_id: str) -> SystemSettings: self.logger.info( "Admin resetting system settings to defaults", - extra={"user_id": user_id}, + user_id=user_id, ) await self._repo.reset_system_settings(user_id=user_id) self._runtime_settings.invalidate_cache() diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index c60e1267..f68cb97a 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -33,7 +33,7 @@ def __init__( self.logger = logger async def get_user_overview(self, user_id: str, hours: int = 24) -> AdminUserOverviewDomain: - self.logger.info("Admin getting user overview", extra={"target_user_id": user_id, "hours": hours}) + self.logger.info("Admin getting user overview", target_user_id=user_id, hours=hours) user = await self._users.get_user_by_id(user_id) if not user: raise NotFoundError("User", user_id) @@ -105,13 +105,11 @@ async def list_users( ) -> UserListResult: self.logger.info( "Admin listing users", - extra={ - "admin_user_id": admin_user_id, - "limit": limit, - "offset": offset, - "search": search, - "role": role, - }, + 
admin_user_id=admin_user_id, + limit=limit, + offset=offset, + search=search, + role=role, ) result = await self._users.list_users(limit=limit, offset=offset, search=search, role=role) @@ -132,7 +130,7 @@ async def list_users( async def create_user(self, *, admin_user_id: str, user_data: UserCreate) -> User: """Create a new user and return domain user.""" self.logger.info( - "Admin creating new user", extra={"admin_user_id": admin_user_id, "new_username": user_data.username} + "Admin creating new user", admin_user_id=admin_user_id, new_username=user_data.username ) # Ensure not exists search_result = await self._users.list_users(limit=1, offset=0, search=user_data.username) @@ -152,20 +150,21 @@ async def create_user(self, *, admin_user_id: str, user_data: UserCreate) -> Use ) created = await self._users.create_user(create_data) self.logger.info( - "User created successfully", extra={"new_username": user_data.username, "admin_user_id": admin_user_id} + "User created successfully", new_username=user_data.username, admin_user_id=admin_user_id ) return created async def get_user(self, *, admin_user_id: str, user_id: str) -> User | None: self.logger.info( - "Admin getting user details", extra={"admin_user_id": admin_user_id, "target_user_id": user_id} + "Admin getting user details", admin_user_id=admin_user_id, target_user_id=user_id ) return await self._users.get_user_by_id(user_id) async def update_user(self, *, admin_user_id: str, user_id: str, update: UserUpdate) -> User | None: self.logger.info( "Admin updating user", - extra={"admin_user_id": admin_user_id, "target_user_id": user_id}, + admin_user_id=admin_user_id, + target_user_id=user_id, ) if update.password is not None: update = update.model_copy(update={"password": self._security.get_password_hash(update.password)}) @@ -174,29 +173,31 @@ async def update_user(self, *, admin_user_id: str, user_id: str, update: UserUpd async def delete_user(self, *, admin_user_id: str, user_id: str, cascade: bool) -> UserDeleteResult: self.logger.info( "Admin deleting user", - extra={"admin_user_id": admin_user_id, "target_user_id": user_id, "cascade": cascade}, + admin_user_id=admin_user_id, + target_user_id=user_id, + cascade=cascade, ) # Reset rate limits prior to deletion await self._rate_limits.reset_user_limits(user_id) result = await self._users.delete_user(user_id, cascade=cascade) if result.user_deleted: - self.logger.info("User deleted successfully", extra={"target_user_id": user_id}) + self.logger.info("User deleted successfully", target_user_id=user_id) return result async def reset_user_password(self, *, admin_user_id: str, user_id: str, new_password: str) -> bool: self.logger.info( - "Admin resetting user password", extra={"admin_user_id": admin_user_id, "target_user_id": user_id} + "Admin resetting user password", admin_user_id=admin_user_id, target_user_id=user_id ) hashed = self._security.get_password_hash(new_password) pr = PasswordReset(user_id=user_id, new_password=hashed) ok = await self._users.reset_user_password(pr) if ok: - self.logger.info("User password reset successfully", extra={"target_user_id": user_id}) + self.logger.info("User password reset successfully", target_user_id=user_id) return ok async def get_user_rate_limits(self, *, admin_user_id: str, user_id: str) -> UserRateLimitsResult: self.logger.info( - "Admin getting user rate limits", extra={"admin_user_id": admin_user_id, "target_user_id": user_id} + "Admin getting user rate limits", admin_user_id=admin_user_id, target_user_id=user_id ) user_limit = await 
self._rate_limits.get_user_rate_limit(user_id) usage_stats = await self._rate_limits.get_usage_stats(user_id) @@ -211,7 +212,8 @@ async def update_user_rate_limits( ) -> RateLimitUpdateResult: self.logger.info( "Admin updating user rate limits", - extra={"admin_user_id": admin_user_id, "target_user_id": user_id}, + admin_user_id=admin_user_id, + target_user_id=user_id, ) config = UserRateLimit( user_id=user_id, @@ -225,7 +227,7 @@ async def update_user_rate_limits( async def reset_user_rate_limits(self, *, admin_user_id: str, user_id: str) -> bool: self.logger.info( - "Admin resetting user rate limits", extra={"admin_user_id": admin_user_id, "target_user_id": user_id} + "Admin resetting user rate limits", admin_user_id=admin_user_id, target_user_id=user_id ) await self._rate_limits.reset_user_limits(user_id) return True diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index ebb6a8ec..fecaf40a 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -169,7 +169,7 @@ async def cleanup_old_sessions(self, older_than_hours: int = 24) -> CleanupResul removed_db = await self._repository.delete_old_sessions(cutoff_time) total_removed = max(removed_memory, removed_db) - self.logger.info("Cleaned up old replay sessions", extra={"removed_count": total_removed}) + self.logger.info("Cleaned up old replay sessions", removed_count=total_removed) return CleanupResult(removed_sessions=total_removed, message=f"Removed {total_removed} old sessions") async def _dispatch_next(self, session: ReplaySessionState) -> None: @@ -269,12 +269,10 @@ async def _finalize_session(self, session: ReplaySessionState, final_status: Rep self._buffer_indices.pop(session.session_id, None) self.logger.info( "Replay session finished", - extra={ - "session_id": session.session_id, - "status": session.status, - "replayed_events": session.replayed_events, - "failed_events": session.failed_events, - }, + session_id=session.session_id, + status=session.status, + replayed_events=session.replayed_events, + failed_events=session.failed_events, ) async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterator[list[DomainEvent]]: @@ -295,7 +293,8 @@ async def _fetch_event_batches(self, session: ReplaySessionState) -> AsyncIterat session.failed_events += 1 self.logger.warning( "Skipping event that failed validation", - extra={"event_id": doc.get("event_id", "unknown"), "error": str(e)}, + event_id=doc.get("event_id", "unknown"), + error=str(e), ) continue @@ -324,7 +323,9 @@ async def _replay_event(self, session: ReplaySessionState, event: DomainEvent) - jitter=None, on_backoff=lambda details: self.logger.error( "Failed to replay event", - extra={"attempt": details["tries"], "max_attempts": max_attempts, "error": str(details["exception"])}, + attempt=details["tries"], + max_attempts=max_attempts, + error=str(details["exception"]), ), ) async def _dispatch() -> None: diff --git a/backend/app/services/execution_service.py b/backend/app/services/execution_service.py index 1ba03bc6..bb3ffef4 100644 --- a/backend/app/services/execution_service.py +++ b/backend/app/services/execution_service.py @@ -112,12 +112,10 @@ async def execute_script( # Log incoming request self.logger.info( "Received script execution request", - extra={ - "lang": lang, - "lang_version": lang_version, - "script_length": len(script), - "priority": priority, - }, + lang=lang, + lang_version=lang_version, + 
script_length=len(script), + priority=priority, ) runtime_cfg = RUNTIME_REGISTRY[lang][lang_version] @@ -136,13 +134,11 @@ async def execute_script( self.logger.info( "Created execution record", - extra={ - "execution_id": created_execution.execution_id, - "lang": lang, - "lang_version": lang_version, - "user_id": user_id, - "script_length": len(script), - }, + execution_id=created_execution.execution_id, + lang=lang, + lang_version=lang_version, + user_id=user_id, + script_length=len(script), ) # Metadata and event — use admin-configurable limits @@ -184,11 +180,9 @@ async def execute_script( self.metrics.record_queue_wait_time(duration, lang_and_version) self.logger.info( "Script execution submitted successfully", - extra={ - "execution_id": created_execution.execution_id, - "status": created_execution.status, - "duration_seconds": duration, - }, + execution_id=created_execution.execution_id, + status=created_execution.status, + duration_seconds=duration, ) return created_execution finally: @@ -247,7 +241,8 @@ async def cancel_execution( self.logger.info( "Published cancellation event", - extra={"execution_id": execution_id, "event_id": event.event_id}, + execution_id=execution_id, + event_id=event.event_id, ) return CancelResult( @@ -379,20 +374,18 @@ async def get_execution_result(self, execution_id: str) -> DomainExecution: """ execution = await self.execution_repo.get_execution(execution_id) if not execution: - self.logger.warning("Execution not found", extra={"execution_id": execution_id}) + self.logger.warning("Execution not found", execution_id=execution_id) raise ExecutionNotFoundError(execution_id) self.logger.info( "Execution result retrieved successfully", - extra={ - "execution_id": execution_id, - "status": execution.status, - "lang": execution.lang, - "lang_version": execution.lang_version, - "has_output": bool(execution.stdout), - "has_errors": bool(execution.stderr), - "resource_usage": execution.resource_usage, - }, + execution_id=execution_id, + status=execution.status, + lang=execution.lang, + lang_version=execution.lang_version, + has_output=bool(execution.stdout), + has_errors=bool(execution.stderr), + resource_usage=execution.resource_usage, ) return execution @@ -430,12 +423,10 @@ async def get_user_executions( self.logger.debug( f"Retrieved {len(executions)} executions for user", - extra={ - "user_id": user_id, - "filters": {k: v for k, v in query.items() if k != "user_id"}, - "limit": limit, - "skip": skip, - }, + user_id=user_id, + filters={k: v for k, v in query.items() if k != "user_id"}, + limit=limit, + skip=skip, ) return executions @@ -518,10 +509,10 @@ async def delete_execution(self, execution_id: str, user_id: str) -> bool: deleted = await self.execution_repo.delete_execution(execution_id) if not deleted: - self.logger.warning("Execution not found for deletion", extra={"execution_id": execution_id}) + self.logger.warning("Execution not found for deletion", execution_id=execution_id) raise ExecutionNotFoundError(execution_id) - self.logger.info("Deleted execution", extra={"execution_id": execution_id}) + self.logger.info("Deleted execution", execution_id=execution_id) await self._publish_deletion_event(execution_id, user_id) @@ -549,10 +540,8 @@ async def _publish_deletion_event(self, execution_id: str, user_id: str) -> None self.logger.info( "Published cancellation event", - extra={ - "execution_id": execution_id, - "event_id": event.event_id, - }, + execution_id=execution_id, + event_id=event.event_id, ) async def get_execution_stats( diff --git 
a/backend/app/services/kafka_event_service.py b/backend/app/services/kafka_event_service.py index 82127b27..a14e8cff 100644 --- a/backend/app/services/kafka_event_service.py +++ b/backend/app/services/kafka_event_service.py @@ -39,5 +39,5 @@ async def publish_event(self, event: DomainEvent, key: str) -> str: await self.kafka_producer.produce(event_to_produce=event, key=key) self.metrics.record_event_published(event.event_type) self.metrics.record_event_processing_duration(time.time() - start_time, event.event_type) - self.logger.info("Event published", extra={"event_type": event.event_type, "event_id": event.event_id}) + self.logger.info("Event published", event_type=event.event_type, event_id=event.event_id) return event.event_id diff --git a/backend/app/services/login_lockout.py b/backend/app/services/login_lockout.py index 1b2cf156..fb139900 100644 --- a/backend/app/services/login_lockout.py +++ b/backend/app/services/login_lockout.py @@ -41,7 +41,8 @@ async def record_failed_attempt(self, username: str) -> bool: await self._redis.set(self._locked_key(username), "1", ex=ttl) self._logger.warning( "Account locked due to too many failed attempts", - extra={"username": username, "attempts": attempts}, + username=username, + attempts=attempts, ) return True diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 8c7171d6..4ea894eb 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -143,13 +143,11 @@ async def create_notification( ) self.logger.info( f"Creating notification for user {user_id}", - extra={ - "user_id": user_id, - "channel": channel, - "severity": str(severity), - "tags": tags, - "scheduled": scheduled_for is not None, - }, + user_id=user_id, + channel=channel, + severity=str(severity), + tags=tags, + scheduled=scheduled_for is not None, ) # Check throttling @@ -256,14 +254,12 @@ async def worker(uid: str) -> str: self.logger.info( "System notification completed", - extra={ - "severity": cfg.severity, - "title": title, - "total_users": len(users), - "created": created, - "failed": failed, - "throttled": throttled, - }, + severity=cfg.severity, + title=title, + total_users=len(users), + created=created, + failed=failed, + throttled=throttled, ) return {"total_users": len(users), "created": created, "failed": failed, "throttled": throttled} @@ -311,7 +307,7 @@ async def _create_system_for_user( return "created" except Exception as e: self.logger.error( - "Failed to create system notification for user", extra={"user_id": user_id, "error": str(e)} + "Failed to create system notification for user", user_id=user_id, error=str(e) ) return "failed" @@ -349,11 +345,9 @@ async def _send_webhook( self.logger.debug( f"Sending webhook notification to {webhook_url}", - extra={ - "notification_id": str(notification.notification_id), - "payload_size": len(str(payload)), - "webhook_url": webhook_url, - }, + notification_id=str(notification.notification_id), + payload_size=len(str(payload)), + webhook_url=webhook_url, ) trace.get_current_span().set_attributes({ @@ -366,11 +360,9 @@ async def _send_webhook( response.raise_for_status() self.logger.debug( "Webhook delivered successfully", - extra={ - "notification_id": str(notification.notification_id), - "status_code": response.status_code, - "response_time_ms": int(response.elapsed.total_seconds() * 1000), - }, + notification_id=str(notification.notification_id), + status_code=response.status_code, + 
response_time_ms=int(response.elapsed.total_seconds() * 1000), ) async def _send_slack(self, notification: DomainNotification, subscription: DomainNotificationSubscription) -> None: @@ -402,11 +394,9 @@ async def _send_slack(self, notification: DomainNotification, subscription: Doma self.logger.debug( "Sending Slack notification", - extra={ - "notification_id": str(notification.notification_id), - "has_action": bool(notification.action_url), - "priority_color": self._get_slack_color(notification.severity), - }, + notification_id=str(notification.notification_id), + has_action=bool(notification.action_url), + priority_color=self._get_slack_color(notification.severity), ) trace.get_current_span().set_attributes({ @@ -418,7 +408,8 @@ async def _send_slack(self, notification: DomainNotification, subscription: Doma response.raise_for_status() self.logger.debug( "Slack notification delivered successfully", - extra={"notification_id": str(notification.notification_id), "status_code": response.status_code}, + notification_id=str(notification.notification_id), + status_code=response.status_code, ) def _get_slack_color(self, priority: NotificationSeverity) -> str: @@ -612,13 +603,11 @@ async def _deliver_notification(self, notification: DomainNotification) -> bool: self.logger.info( f"Delivering notification {notification.notification_id}", - extra={ - "notification_id": str(notification.notification_id), - "user_id": notification.user_id, - "channel": notification.channel, - "severity": notification.severity, - "tags": list(notification.tags or []), - }, + notification_id=str(notification.notification_id), + user_id=notification.user_id, + channel=notification.channel, + severity=notification.severity, + tags=list(notification.tags or []), ) subscription = await self.repository.get_subscription(notification.user_id, notification.channel) @@ -672,11 +661,9 @@ async def _attempt() -> None: ) self.logger.info( f"Delivered notification {notification.notification_id}", - extra={ - "notification_id": str(notification.notification_id), - "channel": notification.channel, - "delivery_time_ms": int(delivery_time * 1000), - }, + notification_id=str(notification.notification_id), + channel=notification.channel, + delivery_time_ms=int(delivery_time * 1000), ) notification_type = notification.tags[0] if notification.tags else "unknown" self.metrics.record_notification_sent( diff --git a/backend/app/services/saga/saga_orchestrator.py b/backend/app/services/saga/saga_orchestrator.py index e6c96473..554d2fb8 100644 --- a/backend/app/services/saga/saga_orchestrator.py +++ b/backend/app/services/saga/saga_orchestrator.py @@ -255,13 +255,14 @@ async def cancel_saga(self, saga_id: str) -> bool: try: saga_instance = await self.get_saga_status(saga_id) if not saga_instance: - self.logger.error("Saga not found", extra={"saga_id": saga_id}) + self.logger.error("Saga not found", saga_id=saga_id) return False if saga_instance.state not in [SagaState.RUNNING, SagaState.CREATED]: self.logger.warning( "Cannot cancel saga in current state. 
Only RUNNING or CREATED sagas can be cancelled.", - extra={"saga_id": saga_id, "state": saga_instance.state}, + saga_id=saga_id, + state=saga_instance.state, ) return False @@ -272,11 +273,9 @@ async def cancel_saga(self, saga_id: str) -> bool: user_id = saga_instance.context_data.user_id self.logger.info( "Saga cancellation initiated", - extra={ - "saga_id": saga_id, - "execution_id": saga_instance.execution_id, - "user_id": user_id, - }, + saga_id=saga_id, + execution_id=saga_instance.execution_id, + user_id=user_id, ) await self._save_saga(saga_instance) @@ -300,13 +299,14 @@ async def cancel_saga(self, saga_id: str) -> bool: await self._compensate_saga(saga_instance, context) - self.logger.info("Saga cancelled successfully", extra={"saga_id": saga_id}) + self.logger.info("Saga cancelled successfully", saga_id=saga_id) return True except Exception as e: self.logger.error( "Error cancelling saga", - extra={"saga_id": saga_id, "error": str(e)}, + saga_id=saga_id, + error=str(e), exc_info=True, ) return False diff --git a/backend/app/services/saga/saga_service.py b/backend/app/services/saga/saga_service.py index e3bd5b7a..8b960aff 100644 --- a/backend/app/services/saga/saga_service.py +++ b/backend/app/services/saga/saga_service.py @@ -32,11 +32,9 @@ def __init__( self.logger.info( "SagaService initialized", - extra={ - "saga_repo": type(saga_repo).__name__, - "execution_repo": type(execution_repo).__name__, - "orchestrator": type(orchestrator).__name__, - }, + saga_repo=type(saga_repo).__name__, + execution_repo=type(execution_repo).__name__, + orchestrator=type(orchestrator).__name__, ) async def check_execution_access(self, execution_id: str, user: User) -> bool: @@ -52,31 +50,31 @@ async def check_execution_access(self, execution_id: str, user: User) -> bool: self.logger.debug( "Access denied to execution", - extra={ - "user_id": user.user_id, - "execution_id": execution_id, - "user_role": user.role, - "execution_exists": execution is not None, - }, + user_id=user.user_id, + execution_id=execution_id, + user_role=user.role, + execution_exists=execution is not None, ) return False async def get_saga_with_access_check(self, saga_id: str, user: User) -> Saga: """Get saga with access control.""" self.logger.debug( - "Getting saga for user", extra={"saga_id": saga_id, "user_id": user.user_id, "user_role": user.role} + "Getting saga for user", saga_id=saga_id, user_id=user.user_id, user_role=user.role ) saga = await self.saga_repo.get_saga(saga_id) if not saga: - self.logger.warning("Saga not found", extra={"saga_id": saga_id}) + self.logger.warning("Saga not found", saga_id=saga_id) raise SagaNotFoundError(saga_id) # Check access permissions if not await self.check_execution_access(saga.execution_id, user): self.logger.warning( "Access denied to saga", - extra={"user_id": user.user_id, "saga_id": saga_id, "execution_id": saga.execution_id}, + user_id=user.user_id, + saga_id=saga_id, + execution_id=saga.execution_id, ) raise SagaAccessDeniedError(saga_id, user.user_id) @@ -90,7 +88,9 @@ async def get_execution_sagas( if not await self.check_execution_access(execution_id, user): self.logger.warning( "Access denied to execution", - extra={"user_id": user.user_id, "execution_id": execution_id, "user_role": user.role}, + user_id=user.user_id, + execution_id=execution_id, + user_role=user.role, ) raise SagaAccessDeniedError(execution_id, user.user_id) @@ -110,28 +110,24 @@ async def list_user_sagas( if not user_execution_ids: self.logger.debug( "User has no executions, returning empty saga list", 
- extra={"user_id": user.user_id}, + user_id=user.user_id, ) return SagaListResult(sagas=[], total=0, skip=skip, limit=limit) saga_filter.execution_ids = user_execution_ids self.logger.debug( "Filtering sagas for user", - extra={ - "user_id": user.user_id, - "execution_count": len(user_execution_ids), - }, + user_id=user.user_id, + execution_count=len(user_execution_ids), ) # Get sagas from repository result = await self.saga_repo.list_sagas(saga_filter, limit, skip) self.logger.debug( "Listed sagas for user", - extra={ - "user_id": user.user_id, - "count": len(result.sagas), - "total": result.total, - "state_filter": state, - }, + user_id=user.user_id, + count=len(result.sagas), + total=result.total, + state_filter=state, ) return result @@ -139,7 +135,9 @@ async def cancel_saga(self, saga_id: str, user: User) -> SagaCancellationResult: """Cancel a saga with permission check.""" self.logger.info( "User requesting saga cancellation", - extra={"user_id": user.user_id, "saga_id": saga_id, "user_role": user.role}, + user_id=user.user_id, + saga_id=saga_id, + user_role=user.role, ) # Get saga with access check saga = await self.get_saga_with_access_check(saga_id, user) @@ -153,10 +151,12 @@ async def cancel_saga(self, saga_id: str, user: User) -> SagaCancellationResult: if success: self.logger.info( "User cancelled saga", - extra={"user_id": user.user_id, "saga_id": saga_id, "user_role": user.role}, + user_id=user.user_id, + saga_id=saga_id, + user_role=user.role, ) else: - self.logger.error("Failed to cancel saga", extra={"saga_id": saga_id, "user_id": user.user_id}) + self.logger.error("Failed to cancel saga", saga_id=saga_id, user_id=user.user_id) message = "Saga cancelled successfully" if success else "Failed to cancel saga" return SagaCancellationResult(success=success, message=message, saga_id=saga_id) diff --git a/backend/app/services/saved_script_service.py b/backend/app/services/saved_script_service.py index e24bfb5e..2779d428 100644 --- a/backend/app/services/saved_script_service.py +++ b/backend/app/services/saved_script_service.py @@ -20,45 +20,41 @@ async def create_saved_script( ) -> DomainSavedScript: self.logger.info( "Creating new saved script", - extra={ - "user_id": user_id, - "script_name": saved_script_create.name, - "script_length": len(saved_script_create.script), - }, + user_id=user_id, + script_name=saved_script_create.name, + script_length=len(saved_script_create.script), ) created_script = await self.saved_script_repo.create_saved_script(saved_script_create, user_id) self.logger.info( "Successfully created saved script", - extra={ - "script_id": str(created_script.script_id), - "user_id": user_id, - "script_name": created_script.name, - }, + script_id=str(created_script.script_id), + user_id=user_id, + script_name=created_script.name, ) return created_script async def get_saved_script(self, script_id: str, user_id: str) -> DomainSavedScript: self.logger.info( "Retrieving saved script", - extra={ - "user_id": user_id, - "script_id": script_id, - }, + user_id=user_id, + script_id=script_id, ) script = await self.saved_script_repo.get_saved_script(script_id, user_id) if not script: self.logger.warning( "Script not found for user", - extra={"user_id": user_id, "script_id": script_id}, + user_id=user_id, + script_id=script_id, ) raise SavedScriptNotFoundError(script_id) self.logger.info( "Successfully retrieved script", - extra={"script_id": script.script_id, "script_name": script.name}, + script_id=script.script_id, + script_name=script.name, ) return script @@ -67,12 
+63,10 @@ async def update_saved_script( ) -> DomainSavedScript: self.logger.info( "Updating saved script", - extra={ - "user_id": user_id, - "script_id": script_id, - "script_name": update_data.name, - "script_length": len(update_data.script) if update_data.script else None, - }, + user_id=user_id, + script_id=script_id, + script_name=update_data.name, + script_length=len(update_data.script) if update_data.script else None, ) await self.saved_script_repo.update_saved_script(script_id, user_id, update_data) @@ -82,44 +76,44 @@ async def update_saved_script( self.logger.info( "Successfully updated script", - extra={"script_id": script_id, "script_name": updated_script.name}, + script_id=script_id, + script_name=updated_script.name, ) return updated_script async def delete_saved_script(self, script_id: str, user_id: str) -> None: self.logger.info( "Deleting saved script", - extra={ - "user_id": user_id, - "script_id": script_id, - }, + user_id=user_id, + script_id=script_id, ) deleted = await self.saved_script_repo.delete_saved_script(script_id, user_id) if not deleted: self.logger.warning( "Script not found for user", - extra={"user_id": user_id, "script_id": script_id}, + user_id=user_id, + script_id=script_id, ) raise SavedScriptNotFoundError(script_id) self.logger.info( "Successfully deleted script", - extra={"script_id": script_id, "user_id": user_id}, + script_id=script_id, + user_id=user_id, ) async def list_saved_scripts(self, user_id: str) -> DomainSavedScriptListResult: self.logger.info( "Listing saved scripts", - extra={ - "user_id": user_id, - }, + user_id=user_id, ) scripts = await self.saved_script_repo.list_saved_scripts(user_id) self.logger.info( "Successfully retrieved saved scripts", - extra={"user_id": user_id, "script_count": len(scripts)}, + user_id=user_id, + script_count=len(scripts), ) return DomainSavedScriptListResult(scripts=scripts) diff --git a/backend/app/services/sse/redis_bus.py b/backend/app/services/sse/redis_bus.py index 72dff3e2..ebda9c7a 100644 --- a/backend/app/services/sse/redis_bus.py +++ b/backend/app/services/sse/redis_bus.py @@ -31,7 +31,8 @@ async def get(self, model: Type[T]) -> T | None: except Exception as e: self.logger.warning( f"Failed to parse Redis message on channel {self._channel}: {e}", - extra={"channel": self._channel, "model": model.__name__}, + channel=self._channel, + model=model.__name__, ) return None diff --git a/backend/app/services/sse/sse_service.py b/backend/app/services/sse/sse_service.py index 7a7e4cd7..3d7ceb1d 100644 --- a/backend/app/services/sse/sse_service.py +++ b/backend/app/services/sse/sse_service.py @@ -91,7 +91,8 @@ async def create_execution_stream(self, execution_id: str, user_id: str) -> Asyn self.logger.info( "Received Redis message for execution", - extra={"execution_id": execution_id, "event_type": msg.event_type}, + execution_id=execution_id, + event_type=msg.event_type, ) try: sse_event = await self._build_sse_event_from_redis(execution_id, msg) @@ -100,19 +101,22 @@ async def create_execution_stream(self, execution_id: str, user_id: str) -> Asyn if msg.event_type in self.TERMINAL_EVENT_TYPES: self.logger.info( "Terminal event for execution", - extra={"execution_id": execution_id, "event_type": msg.event_type}, + execution_id=execution_id, + event_type=msg.event_type, ) return except Exception as e: self.logger.warning( "Failed to process SSE message", - extra={"execution_id": execution_id, "event_type": msg.event_type, "error": str(e)}, + execution_id=execution_id, + event_type=msg.event_type, + 
error=str(e), ) finally: if subscription is not None: await asyncio.shield(subscription.close()) self.metrics.decrement_sse_connections("executions") - self.logger.info("SSE connection closed", extra={"execution_id": execution_id}) + self.logger.info("SSE connection closed", execution_id=execution_id) async def _build_sse_event_from_redis(self, execution_id: str, msg: RedisSSEMessage) -> SSEExecutionEventData: """Build typed SSE event from Redis message.""" @@ -135,7 +139,7 @@ async def create_notification_stream(self, user_id: str) -> AsyncGenerator[dict[ subscription: SSERedisSubscription | None = None try: subscription = await self.sse_bus.open_notification_subscription(user_id) - self.logger.info("Notification subscription opened", extra={"user_id": user_id}) + self.logger.info("Notification subscription opened", user_id=user_id) while True: redis_msg = await subscription.get(RedisNotificationMessage) @@ -158,7 +162,7 @@ async def create_notification_stream(self, user_id: str) -> AsyncGenerator[dict[ finally: if subscription is not None: await asyncio.shield(subscription.close()) - self.logger.info("Notification stream closed", extra={"user_id": user_id}) + self.logger.info("Notification stream closed", user_id=user_id) def _format_sse_event(self, event: SSEExecutionEventData) -> dict[str, Any]: """Format typed SSE event for sse-starlette.""" diff --git a/backend/app/services/user_settings_service.py b/backend/app/services/user_settings_service.py index bee8794f..d7f21584 100644 --- a/backend/app/services/user_settings_service.py +++ b/backend/app/services/user_settings_service.py @@ -40,14 +40,15 @@ def __init__( self.logger.info( "UserSettingsService initialized", - extra={"cache_ttl_seconds": self._cache_ttl.total_seconds(), "max_cache_size": self._max_cache_size}, + cache_ttl_seconds=self._cache_ttl.total_seconds(), + max_cache_size=self._max_cache_size, ) async def get_user_settings(self, user_id: str) -> DomainUserSettings: """Get settings with cache; rebuild and cache on miss.""" if user_id in self._cache: cached = self._cache[user_id] - self.logger.debug(f"Settings cache hit for user {user_id}", extra={"cache_size": len(self._cache)}) + self.logger.debug(f"Settings cache hit for user {user_id}", cache_size=len(self._cache)) return cached return await self.get_user_settings_fresh(user_id) @@ -197,12 +198,12 @@ def _apply_event(self, settings: DomainUserSettings, event: DomainUserSettingsCh async def invalidate_cache(self, user_id: str) -> None: """Invalidate cached settings for a user.""" if self._cache.pop(user_id, None) is not None: - self.logger.debug(f"Invalidated cache for user {user_id}", extra={"cache_size": len(self._cache)}) + self.logger.debug(f"Invalidated cache for user {user_id}", cache_size=len(self._cache)) def _add_to_cache(self, user_id: str, settings: DomainUserSettings) -> None: """Add settings to TTL+LRU cache.""" self._cache[user_id] = settings - self.logger.debug(f"Cached settings for user {user_id}", extra={"cache_size": len(self._cache)}) + self.logger.debug(f"Cached settings for user {user_id}", cache_size=len(self._cache)) async def reset_user_settings(self, user_id: str) -> None: """Reset user settings by deleting all data and cache.""" diff --git a/backend/app/settings.py b/backend/app/settings.py index 8c12c163..3eaa8b52 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -167,4 +167,4 @@ def __init__( SECURE_COOKIES: bool = True # Logging configuration - LOG_LEVEL: str = Field(default="DEBUG", description="Logging level (DEBUG, 
INFO, WARNING, ERROR, CRITICAL)") + LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "DEBUG" diff --git a/backend/pyproject.toml b/backend/pyproject.toml index ba388c3d..ba26ca9b 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -122,7 +122,7 @@ dependencies = [ "aiofiles==25.1.0", "APScheduler==3.10.4", "faststream[kafka]==0.6.6", - "structlog>=25.5.0", + "structlog==25.5.0", ] [build-system] diff --git a/backend/tests/unit/core/test_logging_and_correlation.py b/backend/tests/unit/core/test_logging_and_correlation.py index 0f09fd5d..f5a19735 100644 --- a/backend/tests/unit/core/test_logging_and_correlation.py +++ b/backend/tests/unit/core/test_logging_and_correlation.py @@ -75,6 +75,34 @@ def test_non_string_event_unchanged(self) -> None: result = sanitize_sensitive_data(None, "info", event_dict) assert result["event"] == 42 + def test_sanitizes_exc_info_field(self) -> None: + event_dict: dict[str, Any] = { + "event": "connection failed", + "exc_info": "ConnectionError: mongodb://user:secret123@host/db", + } + result = sanitize_sensitive_data(None, "error", event_dict) + assert "secret123" not in result["exc_info"] + assert "MONGODB_REDACTED" in result["exc_info"] + + def test_sanitizes_stack_info_field(self) -> None: + event_dict: dict[str, Any] = { + "event": "debug trace", + "stack_info": 'password: "hunter2" at line 42', + } + result = sanitize_sensitive_data(None, "debug", event_dict) + assert "hunter2" not in result["stack_info"] + + def test_sanitizes_extra_string_values(self) -> None: + event_dict: dict[str, Any] = { + "event": "check", + "url": "https://admin:s3cret@api.example.com/v1", + "count": 5, + } + result = sanitize_sensitive_data(None, "info", event_dict) + assert "s3cret" not in result["url"] + assert "URL_CREDS_REDACTED" in result["url"] + assert result["count"] == 5 + def test_has_expected_pattern_count(self) -> None: assert len(SENSITIVE_PATTERNS) == 6 diff --git a/backend/uv.lock b/backend/uv.lock index 3c78102d..6576b413 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1288,7 +1288,7 @@ requires-dist = [ { name = "sortedcontainers", specifier = "==2.4.0" }, { name = "sse-starlette", specifier = "==3.2.0" }, { name = "starlette", specifier = "==0.49.1" }, - { name = "structlog", specifier = ">=25.5.0" }, + { name = "structlog", specifier = "==25.5.0" }, { name = "tiktoken", specifier = "==0.11.0" }, { name = "tomli", specifier = "==2.0.2" }, { name = "typing-extensions", specifier = "==4.12.2" }, diff --git a/docs/operations/logging.md b/docs/operations/logging.md index 8d242752..70eb9b89 100644 --- a/docs/operations/logging.md +++ b/docs/operations/logging.md @@ -34,17 +34,15 @@ sensitive data by pattern-matching things like API keys, JWT tokens, and databas ## Structured logging -All log calls use the `extra` parameter to pass structured data rather than interpolating values into the message string. The message itself is a static string that describes what happened; the details go in `extra` where they become separate JSON fields. +All log calls pass structured data as keyword arguments rather than interpolating values into the message string. The message itself is a static string that describes what happened; the details go in keyword args where they become separate top-level JSON fields. 
```python # This is how logging looks throughout the codebase self.logger.info( "Event deleted by admin", - extra={ - "event_id": event_id, - "admin_email": admin.email, - "event_type": result.event_type, - }, + event_id=event_id, + admin_email=admin.email, + event_type=result.event_type, ) ``` @@ -64,9 +62,9 @@ logger.warning(f"Processing event {event_id}") # Your log output now contains a forged critical alert ``` -The fix is to keep user data out of the message string entirely. When you put it in `extra`, the JSON formatter escapes special characters, and the malicious content becomes a harmless string value rather than a log line injection. +The fix is to keep user data out of the message string entirely. When you pass it as a keyword argument, the JSON renderer escapes special characters, and the malicious content becomes a harmless string value rather than a log line injection. -The codebase treats these as user-controlled and keeps them in `extra`: path parameters like execution_id or saga_id, +The codebase treats these as user-controlled and passes them as keyword args: path parameters like execution_id or saga_id, query parameters, request body fields, Kafka message content, database results derived from user input, and exception messages (which often contain user data). @@ -82,13 +80,14 @@ Trace IDs are injected automatically by the OTel filter: | `request_path` | HTTP request | API endpoint path | | `client_host` | HTTP request | Client IP address | -For domain-specific context, developers add fields to `extra` based on what operation they're logging. The pattern is -consistent: the message says what happened, `extra` says to what and by whom. +For domain-specific context, developers add keyword arguments based on what operation they're logging. The pattern is +consistent: the message says what happened, the keyword args say to what and by whom. ## Practical use When something goes wrong, start by filtering logs by `trace_id` to see everything that happened during that -request. Use the `trace_id` to jump to Jaeger for the full distributed trace. +request. Use the `trace_id` to view the full distributed trace in your tracing backend (e.g. Tempo, or any +OTLP-compatible collector configured via `OTLP_TRACES_ENDPOINT` in settings). | Log Level | Use case | |-----------|-------------------------------------------------------------| @@ -97,7 +96,7 @@ request. Use the `trace_id` to jump to Jaeger for the full distributed trace. | WARNING | Recoverable issues | | ERROR | Failures requiring attention | -The log level is controlled by the `LOG_LEVEL` environment variable. +The log level is controlled by the `LOG_LEVEL` setting in `config.toml`. 
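To see the whole pipeline in one place (keyword-argument calls, a redaction processor, JSON output), here is a minimal standalone sketch. It is illustrative only, not the project's `setup_logger`: the field names, connection string, and single redaction pattern are made up, and the real configuration adds OTel context, log-level handling, and more patterns.

```python
import re

import structlog


def redact_mongo_credentials(logger, method_name, event_dict):
    # Redact mongodb://user:password@host credentials in any string field,
    # mirroring the spirit of sanitize_sensitive_data (single pattern only).
    pattern = re.compile(r"(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)", re.IGNORECASE)
    for key, value in event_dict.items():
        if isinstance(value, str):
            event_dict[key] = pattern.sub(r"\1***REDACTED***\3", value)
    return event_dict


structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        redact_mongo_credentials,
        structlog.processors.JSONRenderer(),
    ],
)

log = structlog.get_logger("example")

# Details go in keyword arguments; the message stays a static string.
log.info(
    "Execution submitted",
    execution_id="abc-123",
    db_url="mongodb://svc:hunter2@mongo:27017/app",
)
# Prints one JSON line where db_url reads
# "mongodb://svc:***REDACTED***@mongo:27017/app" and execution_id is a
# separate top-level field (key order may vary).
```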
## Key files diff --git a/frontend/src/routes/admin/AdminEvents.svelte b/frontend/src/routes/admin/AdminEvents.svelte index 3cca9c0b..010d461e 100644 --- a/frontend/src/routes/admin/AdminEvents.svelte +++ b/frontend/src/routes/admin/AdminEvents.svelte @@ -166,7 +166,7 @@ progress_percentage: 0, failed_events: 0, skipped_events: 0, - replay_id: '', + replay_id: response.replay_id, created_at: new Date().toISOString() }; checkReplayStatus(sessionId); From 8b25500f826e34e1a06e65459ac8563dfff0cbae Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 15:31:21 +0100 Subject: [PATCH 07/12] fix: found issues --- backend/app/core/dishka_lifespan.py | 5 +++++ backend/app/core/logging.py | 16 ++++++++++++--- backend/app/core/tracing/__init__.py | 15 +++++++++----- backend/app/services/notification_service.py | 9 ++++++--- .../unit/core/test_logging_and_correlation.py | 20 ++++++++++++++++--- docs/operations/logging.md | 4 ++-- 6 files changed, 53 insertions(+), 16 deletions(-) diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py index ce107cac..11a8f2e2 100644 --- a/backend/app/core/dishka_lifespan.py +++ b/backend/app/core/dishka_lifespan.py @@ -11,6 +11,7 @@ from faststream.kafka import KafkaBroker from pymongo import AsyncMongoClient +from app.core.tracing import Tracer from app.db.docs import ALL_DOCUMENTS from app.events.handlers import ( register_notification_subscriber, @@ -44,6 +45,10 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: await init_beanie(database=database, document_models=ALL_DOCUMENTS) logger.info("MongoDB initialized via Beanie") + tracer: Tracer = await container.get(Tracer) + tracer.instrument_app(app) + logger.info("FastAPI OpenTelemetry instrumentation applied") + logger.info( "Starting application with dishka DI", project_name=settings.PROJECT_NAME, diff --git a/backend/app/core/logging.py b/backend/app/core/logging.py index d5a8c398..90c8d2c6 100644 --- a/backend/app/core/logging.py +++ b/backend/app/core/logging.py @@ -13,13 +13,21 @@ (r"(eyJ[A-Za-z0-9\-_]+\.eyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+)", r"***JWT_REDACTED***"), (r"(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@)", r"\1***MONGODB_REDACTED***\3"), (r"(https?://[^:]+:)([^@]+)(@)", r"\1***URL_CREDS_REDACTED***\3"), - (r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", r"***EMAIL_REDACTED***"), ] +_EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", re.IGNORECASE) + + +def _mask_email(match: re.Match[str]) -> str: + local = match.group(1) + domain = match.group(2) + return f"{local[:3]}***@{domain}" + def _redact(value: str) -> str: for pattern, replacement in SENSITIVE_PATTERNS: value = re.sub(pattern, replacement, value, flags=re.IGNORECASE) + value = _EMAIL_PATTERN.sub(_mask_email, value) return value @@ -31,7 +39,8 @@ def sanitize_sensitive_data( """Structlog processor that redacts sensitive data from all string fields. Covers event message, formatted exception text, stack info, and any - string value added by prior processors. + string value added by prior processors. Emails are masked to show the + first 3 characters of the local part plus the full domain. 
""" for key, value in event_dict.items(): if isinstance(value, str): @@ -78,7 +87,8 @@ def setup_logger(log_level: str) -> structlog.stdlib.BoundLogger: cache_logger_on_first_use=True, ) - logging.basicConfig(level=log_level.upper(), format="%(message)s", handlers=[logging.StreamHandler()]) + logging.basicConfig(format="%(message)s", handlers=[logging.StreamHandler()]) + logging.getLogger().setLevel(log_level.upper()) logger: structlog.stdlib.BoundLogger = structlog.get_logger("integr8scode") return logger diff --git a/backend/app/core/tracing/__init__.py b/backend/app/core/tracing/__init__.py index 803a7e69..787e2d56 100644 --- a/backend/app/core/tracing/__init__.py +++ b/backend/app/core/tracing/__init__.py @@ -1,6 +1,7 @@ import os import structlog +from fastapi import FastAPI from opentelemetry import trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor @@ -54,11 +55,15 @@ def __init__(self, settings: Settings, logger: structlog.stdlib.BoundLogger) -> set_global_textmap(TraceContextTextMapPropagator()) tp = trace.get_tracer_provider() - FastAPIInstrumentor().instrument( - tracer_provider=tp, excluded_urls="health,metrics,docs,openapi.json", - ) HTTPXClientInstrumentor().instrument(tracer_provider=tp) PymongoInstrumentor().instrument(tracer_provider=tp) - LoggingInstrumentor().instrument(set_logging_format=True, log_level="INFO") + LoggingInstrumentor().instrument(set_logging_format=False, log_level="INFO") + + logger.info("Tracing initialized", service_name=name) - logger.info(f"Tracing initialized for {name}") + def instrument_app(self, app: FastAPI) -> None: + """Instrument an existing FastAPI app with OpenTelemetry middleware.""" + tp = trace.get_tracer_provider() + FastAPIInstrumentor().instrument_app( + app, tracer_provider=tp, excluded_urls="health,metrics,docs,openapi.json", + ) diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 4ea894eb..ef837da6 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from datetime import UTC, datetime, timedelta from typing import Awaitable, Callable +from urllib.parse import urlparse import backoff import httpx @@ -343,17 +344,19 @@ async def _send_webhook( headers = notification.webhook_headers or {} headers["Content-Type"] = "application/json" + safe_host = urlparse(webhook_url).netloc + self.logger.debug( - f"Sending webhook notification to {webhook_url}", + "Sending webhook notification", notification_id=str(notification.notification_id), payload_size=len(str(payload)), - webhook_url=webhook_url, + webhook_host=safe_host, ) trace.get_current_span().set_attributes({ "notification.id": str(notification.notification_id), "notification.channel": "webhook", - "notification.webhook_url": webhook_url, + "notification.webhook_host": safe_host, }) async with httpx.AsyncClient() as client: response = await client.post(webhook_url, json=payload, headers=headers, timeout=30.0) diff --git a/backend/tests/unit/core/test_logging_and_correlation.py b/backend/tests/unit/core/test_logging_and_correlation.py index f5a19735..4bbc841e 100644 --- a/backend/tests/unit/core/test_logging_and_correlation.py +++ b/backend/tests/unit/core/test_logging_and_correlation.py @@ -38,7 +38,7 @@ def _run_processor(event: str) -> str: "secretpass", "MONGODB_REDACTED", ), - ("user email: 
test@example.com", "test@example.com", "EMAIL_REDACTED"), + ("user email: test@example.com", "test@example.com", "tes***@example.com"), ('password: "mysecret123"', "mysecret123", "REDACTED"), ( "https://user:password@api.example.com/endpoint", @@ -68,7 +68,7 @@ def test_sanitizes_multiple_types_in_one_message(self) -> None: result = self._run_processor(msg) assert "BEARER_TOKEN_REDACTED" in result assert "MONGODB_REDACTED" in result - assert "EMAIL_REDACTED" in result + assert "a***@b.com" in result def test_non_string_event_unchanged(self) -> None: event_dict: dict[str, Any] = {"event": 42} @@ -104,7 +104,21 @@ def test_sanitizes_extra_string_values(self) -> None: assert result["count"] == 5 def test_has_expected_pattern_count(self) -> None: - assert len(SENSITIVE_PATTERNS) == 6 + assert len(SENSITIVE_PATTERNS) == 5 + + @pytest.mark.parametrize( + ("email", "expected"), + [ + ("admin@example.com", "adm***@example.com"), + ("ab@example.com", "ab***@example.com"), + ("longlocal@domain.org", "lon***@domain.org"), + ], + ids=["normal", "short_local", "long_local"], + ) + def test_email_masked_preserves_prefix_and_domain(self, email: str, expected: str) -> None: + event_dict: dict[str, Any] = {"event": "action", "detail": email} + result = sanitize_sensitive_data(None, "info", event_dict) + assert result["detail"] == expected class TestAddOtelContext: diff --git a/docs/operations/logging.md b/docs/operations/logging.md index 70eb9b89..51a2d5b8 100644 --- a/docs/operations/logging.md +++ b/docs/operations/logging.md @@ -21,7 +21,7 @@ The logger is created once during application startup via dependency injection. JSON formatter and attaches a filter for OpenTelemetry trace context: ```python ---8<-- "backend/app/core/logging.py:110:147" +--8<-- "backend/app/core/logging.py:66:94" ``` The JSON formatter does two things beyond basic formatting. 
First, it injects context that would be tedious to pass @@ -29,7 +29,7 @@ manually—the trace and span IDs from OpenTelemetry, and request metadata like sensitive data by pattern-matching things like API keys, JWT tokens, and database URLs: ```python ---8<-- "backend/app/core/logging.py:35:59" +--8<-- "backend/app/core/logging.py:34:63" ``` ## Structured logging From cbaaf1e8b92c713675c92538cef19e7c2c9fc4ff Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 15:49:38 +0100 Subject: [PATCH 08/12] fix: tracer - to separate file; also removed os.environ calls (right now taking stuff from settings obj) --- backend/app/core/dishka_lifespan.py | 8 ++- backend/app/core/providers.py | 6 +- backend/app/core/tracing/__init__.py | 70 +------------------ backend/app/core/tracing/tracer.py | 58 +++++++++++++++ backend/app/services/pod_monitor/config.py | 5 +- backend/app/settings.py | 1 + backend/scripts/create_topics.py | 6 +- .../pod_monitor/test_config_and_init.py | 2 +- 8 files changed, 78 insertions(+), 78 deletions(-) create mode 100644 backend/app/core/tracing/tracer.py diff --git a/backend/app/core/dishka_lifespan.py b/backend/app/core/dishka_lifespan.py index 11a8f2e2..d484e267 100644 --- a/backend/app/core/dishka_lifespan.py +++ b/backend/app/core/dishka_lifespan.py @@ -9,6 +9,8 @@ from dishka.integrations.faststream import setup_dishka as setup_dishka_faststream from fastapi import FastAPI from faststream.kafka import KafkaBroker +from opentelemetry import trace +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from pymongo import AsyncMongoClient from app.core.tracing import Tracer @@ -45,8 +47,10 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: await init_beanie(database=database, document_models=ALL_DOCUMENTS) logger.info("MongoDB initialized via Beanie") - tracer: Tracer = await container.get(Tracer) - tracer.instrument_app(app) + await container.get(Tracer) + FastAPIInstrumentor().instrument_app( + app, tracer_provider=trace.get_tracer_provider(), excluded_urls="health,metrics,docs,openapi.json", + ) logger.info("FastAPI OpenTelemetry instrumentation applied") logger.info( diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 65733e53..beb24f6e 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -793,6 +793,7 @@ def get_event_mapper( @provide async def get_pod_monitor( self, + settings: Settings, kafka_event_service: KafkaEventService, api_client: k8s_client.ApiClient, logger: structlog.stdlib.BoundLogger, @@ -800,7 +801,10 @@ async def get_pod_monitor( kubernetes_metrics: KubernetesMetrics, ) -> AsyncIterator[PodMonitor]: - config = PodMonitorConfig() + config = PodMonitorConfig( + namespace=settings.K8S_NAMESPACE, + kubeconfig_path=settings.KUBERNETES_CONFIG_PATH, + ) monitor = PodMonitor( config=config, kafka_event_service=kafka_event_service, diff --git a/backend/app/core/tracing/__init__.py b/backend/app/core/tracing/__init__.py index 787e2d56..78550282 100644 --- a/backend/app/core/tracing/__init__.py +++ b/backend/app/core/tracing/__init__.py @@ -1,69 +1,3 @@ -import os +from app.core.tracing.tracer import Tracer -import structlog -from fastapi import FastAPI -from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor -from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor -from opentelemetry.instrumentation.logging import 
LoggingInstrumentor -from opentelemetry.instrumentation.pymongo import PymongoInstrumentor -from opentelemetry.propagate import set_global_textmap -from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.sdk.trace.sampling import ALWAYS_OFF, ALWAYS_ON, ParentBased, Sampler, TraceIdRatioBased -from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator - -from app.settings import Settings - - -class Tracer: - """DI-managed OpenTelemetry tracer. Initialization happens on construction.""" - - def __init__(self, settings: Settings, logger: structlog.stdlib.BoundLogger) -> None: - name = settings.TRACING_SERVICE_NAME - rate = settings.TRACING_SAMPLING_RATE - - resource = Resource.create({ - SERVICE_NAME: name, - SERVICE_VERSION: settings.TRACING_SERVICE_VERSION, - "deployment.environment": "test" if settings.TESTING else "production", - "service.namespace": "integr8scode", - "service.instance.id": os.environ.get("HOSTNAME", "unknown"), - }) - - sampler: Sampler - if rate <= 0: - sampler = ALWAYS_OFF - elif rate >= 1.0: - sampler = ALWAYS_ON - else: - sampler = ParentBased(root=TraceIdRatioBased(rate)) - - provider = TracerProvider(resource=resource, sampler=sampler) - - if settings.OTLP_TRACES_ENDPOINT: - provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter( - endpoint=settings.OTLP_TRACES_ENDPOINT, - insecure=settings.OTLP_TRACES_ENDPOINT.startswith("http://"), - )) - ) - - trace.set_tracer_provider(provider) - set_global_textmap(TraceContextTextMapPropagator()) - - tp = trace.get_tracer_provider() - HTTPXClientInstrumentor().instrument(tracer_provider=tp) - PymongoInstrumentor().instrument(tracer_provider=tp) - LoggingInstrumentor().instrument(set_logging_format=False, log_level="INFO") - - logger.info("Tracing initialized", service_name=name) - - def instrument_app(self, app: FastAPI) -> None: - """Instrument an existing FastAPI app with OpenTelemetry middleware.""" - tp = trace.get_tracer_provider() - FastAPIInstrumentor().instrument_app( - app, tracer_provider=tp, excluded_urls="health,metrics,docs,openapi.json", - ) +__all__ = ["Tracer"] diff --git a/backend/app/core/tracing/tracer.py b/backend/app/core/tracing/tracer.py new file mode 100644 index 00000000..527b8076 --- /dev/null +++ b/backend/app/core/tracing/tracer.py @@ -0,0 +1,58 @@ +import structlog +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +from opentelemetry.instrumentation.logging import LoggingInstrumentor +from opentelemetry.instrumentation.pymongo import PymongoInstrumentor +from opentelemetry.propagate import set_global_textmap +from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.sampling import ALWAYS_OFF, ALWAYS_ON, ParentBased, Sampler, TraceIdRatioBased +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + +from app.settings import Settings + + +class Tracer: + """DI-managed OpenTelemetry tracer. 
Initialization happens on construction.""" + + def __init__(self, settings: Settings, logger: structlog.stdlib.BoundLogger) -> None: + name = settings.TRACING_SERVICE_NAME + rate = settings.TRACING_SAMPLING_RATE + + resource = Resource.create({ + SERVICE_NAME: name, + SERVICE_VERSION: settings.TRACING_SERVICE_VERSION, + "deployment.environment": "test" if settings.TESTING else "production", + "service.namespace": "integr8scode", + "service.instance.id": settings.HOSTNAME, + }) + + sampler: Sampler + if rate <= 0: + sampler = ALWAYS_OFF + elif rate >= 1.0: + sampler = ALWAYS_ON + else: + sampler = ParentBased(root=TraceIdRatioBased(rate)) + + provider = TracerProvider(resource=resource, sampler=sampler) + + if settings.OTLP_TRACES_ENDPOINT: + provider.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter( + endpoint=settings.OTLP_TRACES_ENDPOINT, + insecure=settings.OTLP_TRACES_ENDPOINT.startswith("http://"), + )) + ) + + trace.set_tracer_provider(provider) + set_global_textmap(TraceContextTextMapPropagator()) + + tp = trace.get_tracer_provider() + HTTPXClientInstrumentor().instrument(tracer_provider=tp) + PymongoInstrumentor().instrument(tracer_provider=tp) + LoggingInstrumentor().instrument(set_logging_format=False, log_level="INFO") + + logger.info("Tracing initialized", service_name=name) diff --git a/backend/app/services/pod_monitor/config.py b/backend/app/services/pod_monitor/config.py index 04af8524..97b12aa6 100644 --- a/backend/app/services/pod_monitor/config.py +++ b/backend/app/services/pod_monitor/config.py @@ -1,4 +1,3 @@ -import os from dataclasses import dataclass, field from app.domain.enums import EventType, KafkaTopic @@ -17,8 +16,8 @@ class PodMonitorConfig: execution_failed_topic: KafkaTopic = get_topic_for_event(EventType.EXECUTION_FAILED) # Kubernetes settings - namespace: str = os.environ.get("K8S_NAMESPACE", "integr8scode") - kubeconfig_path: str | None = os.environ.get("KUBECONFIG", None) + namespace: str = "integr8scode" + kubeconfig_path: str | None = None in_cluster: bool = False # Watch settings diff --git a/backend/app/settings.py b/backend/app/settings.py index 3eaa8b52..e3046241 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -152,6 +152,7 @@ def __init__( SERVICE_NAME: str = "integr8scode-backend" SERVICE_VERSION: str = "1.0.0" ENVIRONMENT: str = "production" # deployment environment (production, staging, development) + HOSTNAME: str = "unknown" # container hostname, set via TOML or override # OpenTelemetry metrics export endpoint OTEL_EXPORTER_OTLP_ENDPOINT: str | None = None diff --git a/backend/scripts/create_topics.py b/backend/scripts/create_topics.py index 2cf81e21..75477ac9 100755 --- a/backend/scripts/create_topics.py +++ b/backend/scripts/create_topics.py @@ -4,7 +4,6 @@ """ import asyncio -import os import sys from aiokafka.admin import AIOKafkaAdminClient, NewTopic @@ -13,7 +12,8 @@ from app.infrastructure.kafka.topics import get_all_topics, get_topic_configs from app.settings import Settings -logger = setup_logger(os.environ.get("LOG_LEVEL", "INFO")) +settings = Settings() +logger = setup_logger(settings.LOG_LEVEL) async def create_topics(settings: Settings) -> None: @@ -100,7 +100,7 @@ async def main() -> None: logger.info("Starting Kafka topic creation...") try: - await create_topics(Settings()) + await create_topics(settings) logger.info("Topic creation completed successfully") except Exception as e: logger.error(f"Topic creation failed: {e}") diff --git 
a/backend/tests/unit/services/pod_monitor/test_config_and_init.py b/backend/tests/unit/services/pod_monitor/test_config_and_init.py index 57fd710a..8e2c14d6 100644 --- a/backend/tests/unit/services/pod_monitor/test_config_and_init.py +++ b/backend/tests/unit/services/pod_monitor/test_config_and_init.py @@ -9,7 +9,7 @@ def test_pod_monitor_config_defaults() -> None: cfg = PodMonitorConfig() - assert cfg.namespace in {"integr8scode", "default"} + assert cfg.namespace == "integr8scode" assert isinstance(cfg.pod_events_topic, KafkaTopic) and cfg.pod_events_topic assert isinstance(cfg.execution_completed_topic, KafkaTopic) assert cfg.ignored_pod_phases == [] From a9c9858ac6a92d5b798e0df41d258162779a22fe Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 16:01:54 +0100 Subject: [PATCH 09/12] fix: separate provider for pod monitor config --- backend/app/core/providers.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index beb24f6e..e9fcf64d 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -790,10 +790,17 @@ def get_event_mapper( ) -> PodEventMapper: return PodEventMapper(logger=logger, k8s_api=k8s_client.CoreV1Api(api_client)) + @provide + def get_pod_monitor_config(self, settings: Settings) -> PodMonitorConfig: + return PodMonitorConfig( + namespace=settings.K8S_NAMESPACE, + kubeconfig_path=settings.KUBERNETES_CONFIG_PATH, + ) + @provide async def get_pod_monitor( self, - settings: Settings, + config: PodMonitorConfig, kafka_event_service: KafkaEventService, api_client: k8s_client.ApiClient, logger: structlog.stdlib.BoundLogger, @@ -801,10 +808,6 @@ async def get_pod_monitor( kubernetes_metrics: KubernetesMetrics, ) -> AsyncIterator[PodMonitor]: - config = PodMonitorConfig( - namespace=settings.K8S_NAMESPACE, - kubeconfig_path=settings.KUBERNETES_CONFIG_PATH, - ) monitor = PodMonitor( config=config, kafka_event_service=kafka_event_service, From e44a11399c6e15550e4ca3cecc03e28b7aa1c636 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 16:26:31 +0100 Subject: [PATCH 10/12] fix: moved imprecise req size middleware to dependency of fastapi to dependencies list --- backend/app/api/dependencies.py | 14 ++++++++- backend/app/core/middlewares/__init__.py | 2 -- .../core/middlewares/request_size_limit.py | 30 ------------------- backend/app/main.py | 6 ++-- backend/app/settings.py | 1 + backend/config.toml | 1 + backend/tests/e2e/app/test_main_app.py | 8 +---- backend/tests/e2e/core/test_middlewares.py | 19 ++++++++++-- 8 files changed, 36 insertions(+), 45 deletions(-) delete mode 100644 backend/app/core/middlewares/request_size_limit.py diff --git a/backend/app/api/dependencies.py b/backend/app/api/dependencies.py index 6387ec97..3ef9c1b1 100644 --- a/backend/app/api/dependencies.py +++ b/backend/app/api/dependencies.py @@ -1,11 +1,23 @@ from dishka import FromDishka from dishka.integrations.fastapi import inject -from fastapi import Request +from fastapi import HTTPException, Request from app.domain.user import User from app.services.auth_service import AuthService +async def check_request_size(request: Request) -> None: + """Reject requests whose body exceeds MAX_REQUEST_SIZE_MB from settings.""" + settings = request.app.state.settings + max_bytes = settings.MAX_REQUEST_SIZE_MB * 1024 * 1024 + body = await request.body() + if len(body) > max_bytes: + raise HTTPException( + status_code=413, + detail=f"Request too large. 
Maximum size is {settings.MAX_REQUEST_SIZE_MB}MB", + ) + + @inject async def current_user(request: Request, auth_service: FromDishka[AuthService]) -> User: """Get authenticated user.""" diff --git a/backend/app/core/middlewares/__init__.py b/backend/app/core/middlewares/__init__.py index 0ea15f3a..aa1a9c4f 100644 --- a/backend/app/core/middlewares/__init__.py +++ b/backend/app/core/middlewares/__init__.py @@ -2,7 +2,6 @@ from .csrf import CSRFMiddleware from .metrics import MetricsMiddleware, create_system_metrics, setup_metrics from .rate_limit import RateLimitMiddleware -from .request_size_limit import RequestSizeLimitMiddleware __all__ = [ "CacheControlMiddleware", @@ -10,6 +9,5 @@ "MetricsMiddleware", "setup_metrics", "create_system_metrics", - "RequestSizeLimitMiddleware", "RateLimitMiddleware", ] diff --git a/backend/app/core/middlewares/request_size_limit.py b/backend/app/core/middlewares/request_size_limit.py deleted file mode 100644 index dcfdecd0..00000000 --- a/backend/app/core/middlewares/request_size_limit.py +++ /dev/null @@ -1,30 +0,0 @@ -from starlette.responses import JSONResponse -from starlette.types import ASGIApp, Receive, Scope, Send - - -class RequestSizeLimitMiddleware: - """Middleware to limit request size, default 10MB""" - - def __init__(self, app: ASGIApp, max_size_mb: int = 10) -> None: - self.app = app - self.max_size_bytes = max_size_mb * 1024 * 1024 - - async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: - if scope["type"] != "http": - await self.app(scope, receive, send) - return - - headers = dict(scope["headers"]) - content_length_header = headers.get(b"content-length") - - if content_length_header: - content_length = int(content_length_header) - if content_length > self.max_size_bytes: - response = JSONResponse( - status_code=413, - content={"detail": f"Request too large. 
Maximum size is {self.max_size_bytes / 1024 / 1024}MB"}, - ) - await response(scope, receive, send) - return - - await self.app(scope, receive, send) diff --git a/backend/app/main.py b/backend/app/main.py index 2cdf36f5..ad60880d 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,8 +1,9 @@ import uvicorn from dishka.integrations.fastapi import setup_dishka as setup_dishka_fastapi -from fastapi import FastAPI +from fastapi import Depends, FastAPI from fastapi.middleware.cors import CORSMiddleware +from app.api.dependencies import check_request_size from app.api.routes import ( auth, dlq, @@ -33,7 +34,6 @@ CSRFMiddleware, MetricsMiddleware, RateLimitMiddleware, - RequestSizeLimitMiddleware, setup_metrics, ) from app.settings import Settings @@ -60,6 +60,7 @@ def create_app(settings: Settings | None = None) -> FastAPI: openapi_url=None, docs_url=None, redoc_url=None, + dependencies=[Depends(check_request_size)], ) # Store settings on app state for lifespan access @@ -74,7 +75,6 @@ def create_app(settings: Settings | None = None) -> FastAPI: app.add_middleware(MetricsMiddleware) app.add_middleware(RateLimitMiddleware, settings=settings) app.add_middleware(CSRFMiddleware) - app.add_middleware(RequestSizeLimitMiddleware) app.add_middleware(CacheControlMiddleware) app.add_middleware( diff --git a/backend/app/settings.py b/backend/app/settings.py index e3046241..178df1ae 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -58,6 +58,7 @@ def __init__( KUBERNETES_CA_CERTIFICATE_PATH: str | None = None RATE_LIMITS: str = "100/minute" RATE_LIMIT_ENABLED: bool = True # Set to False to disable rate limiting entirely + MAX_REQUEST_SIZE_MB: int = 10 SSL_KEYFILE: str = "/app/certs/server.key" SSL_CERTFILE: str = "/app/certs/server.crt" diff --git a/backend/config.toml b/backend/config.toml index 369d4ea5..9049d8b1 100644 --- a/backend/config.toml +++ b/backend/config.toml @@ -20,6 +20,7 @@ K8S_POD_EXECUTION_TIMEOUT = 5 K8S_NAMESPACE = "integr8scode" RATE_LIMITS = "100/minute" +MAX_REQUEST_SIZE_MB = 10 SERVER_HOST = "0.0.0.0" diff --git a/backend/tests/e2e/app/test_main_app.py b/backend/tests/e2e/app/test_main_app.py index b1e7a914..d58a5107 100644 --- a/backend/tests/e2e/app/test_main_app.py +++ b/backend/tests/e2e/app/test_main_app.py @@ -128,11 +128,6 @@ def test_cors_middleware_configured(self, app: FastAPI) -> None: middleware_classes = self._get_middleware_class_names(app) assert "CORSMiddleware" in middleware_classes - def test_request_size_limit_middleware_configured(self, app: FastAPI) -> None: - """Request size limit middleware is configured.""" - middleware_classes = self._get_middleware_class_names(app) - assert "RequestSizeLimitMiddleware" in middleware_classes - def test_cache_control_middleware_configured(self, app: FastAPI) -> None: """Cache control middleware is configured.""" middleware_classes = self._get_middleware_class_names(app) @@ -155,11 +150,10 @@ def test_csrf_middleware_configured(self, app: FastAPI) -> None: def test_middleware_count(self, app: FastAPI) -> None: """Expected number of middlewares are configured.""" - # CORS, RequestSizeLimit, CacheControl, Metrics, RateLimit, CSRF + # CORS, CacheControl, Metrics, RateLimit, CSRF middleware_classes = self._get_middleware_class_names(app) expected_middlewares = { "CORSMiddleware", - "RequestSizeLimitMiddleware", "CacheControlMiddleware", "MetricsMiddleware", "RateLimitMiddleware", diff --git a/backend/tests/e2e/core/test_middlewares.py b/backend/tests/e2e/core/test_middlewares.py index 
2564fe86..7c2af2c5 100644 --- a/backend/tests/e2e/core/test_middlewares.py +++ b/backend/tests/e2e/core/test_middlewares.py @@ -69,8 +69,8 @@ async def test_authenticated_post_with_csrf_succeeds( assert response.status_code != 403 -class TestRequestSizeLimitMiddleware: - """Tests for RequestSizeLimitMiddleware.""" +class TestRequestSizeLimit: + """Tests for check_request_size dependency.""" @pytest.mark.asyncio async def test_small_request_allowed( @@ -106,6 +106,21 @@ async def test_large_request_rejected( assert response.status_code == 413 assert "too large" in response.json()["detail"].lower() + @pytest.mark.asyncio + async def test_large_request_without_content_length_rejected( + self, client: httpx.AsyncClient + ) -> None: + """Requests without Content-Length header are still checked by body size.""" + large_payload = "x" * (11 * 1024 * 1024) # 11MB + + response = await client.post( + "/api/v1/auth/register", + content=large_payload, + headers={"Content-Type": "text/plain", "Transfer-Encoding": "chunked"}, + ) + + assert response.status_code == 413 + class TestCacheControlMiddleware: """Tests for CacheControlMiddleware.""" From 42662b74753ed56441df3823ac18914d0fd8ccc8 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 16:40:11 +0100 Subject: [PATCH 11/12] fix: request size dependency --- backend/app/api/dependencies.py | 35 ++++++++++++++++----- docs/architecture/middleware.md | 54 +++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/backend/app/api/dependencies.py b/backend/app/api/dependencies.py index 3ef9c1b1..826d33ee 100644 --- a/backend/app/api/dependencies.py +++ b/backend/app/api/dependencies.py @@ -7,15 +7,34 @@ async def check_request_size(request: Request) -> None: - """Reject requests whose body exceeds MAX_REQUEST_SIZE_MB from settings.""" + """Reject requests whose body exceeds MAX_REQUEST_SIZE_MB from settings. + + Two-phase check: + 1. Content-Length header — rejects known-oversized requests without any I/O. + 2. Streaming read with cap — rejects as soon as accumulated bytes exceed the + limit, so a missing or dishonest Content-Length cannot force the full + payload into memory. + + After a successful check the body is cached on ``request._body`` so that + downstream calls to ``request.body()`` return the already-read bytes + (same attribute Starlette uses internally for caching). + """ settings = request.app.state.settings - max_bytes = settings.MAX_REQUEST_SIZE_MB * 1024 * 1024 - body = await request.body() - if len(body) > max_bytes: - raise HTTPException( - status_code=413, - detail=f"Request too large. Maximum size is {settings.MAX_REQUEST_SIZE_MB}MB", - ) + max_bytes: int = settings.MAX_REQUEST_SIZE_MB * 1024 * 1024 + detail = f"Request too large. 
Maximum size is {settings.MAX_REQUEST_SIZE_MB}MB" + + content_length = request.headers.get("content-length") + if content_length is not None and int(content_length) > max_bytes: + raise HTTPException(status_code=413, detail=detail) + + received = 0 + chunks: list[bytes] = [] + async for chunk in request.stream(): + received += len(chunk) + if received > max_bytes: + raise HTTPException(status_code=413, detail=detail) + chunks.append(chunk) + request._body = b"".join(chunks) @inject diff --git a/docs/architecture/middleware.md b/docs/architecture/middleware.md index e739ff27..0cce2db9 100644 --- a/docs/architecture/middleware.md +++ b/docs/architecture/middleware.md @@ -1,35 +1,45 @@ -# Middleware +# Middleware & App-Level Guards -The backend uses a stack of ASGI middleware to handle cross-cutting concerns like rate limiting, request size -validation, caching, and metrics collection. Middleware runs in order from outermost to innermost, with response -processing in reverse order. +The backend uses ASGI middleware for cross-cutting concerns like rate limiting, caching, and metrics collection, plus a +FastAPI app-level dependency for request size enforcement. Middleware runs in order from outermost to innermost, with +response processing in reverse order. ## Middleware Stack The middleware is applied in this order (outermost first): -1. **RequestSizeLimitMiddleware** - Rejects oversized requests -2. **RateLimitMiddleware** - Enforces per-user/per-endpoint limits -3. **CacheControlMiddleware** - Adds cache headers to responses -4. **MetricsMiddleware** - Collects HTTP request metrics +1. **RateLimitMiddleware** - Enforces per-user/per-endpoint limits +2. **CacheControlMiddleware** - Adds cache headers to responses +3. **MetricsMiddleware** - Collects HTTP request metrics -## Request Size Limit +## Request Size Limit (App Dependency) -Rejects requests exceeding a configurable size limit (default 10MB). This protects against denial-of-service attacks -from large payloads. +Request size enforcement is implemented as a FastAPI app-level dependency (`check_request_size` in +`app/api/dependencies.py`) rather than ASGI middleware. It is registered in `FastAPI(dependencies=[...])` so it runs on +every request. -```python ---8<-- "backend/app/core/middlewares/request_size_limit.py:5:10" -``` +The dependency uses a two-phase approach to reject oversized requests without buffering the entire payload into memory: + +1. **Content-Length fast-path** — if the header is present and exceeds `MAX_REQUEST_SIZE_MB`, the request is rejected + immediately with zero body I/O. +2. **Streaming read with cap** — the body is read chunk-by-chunk via `request.stream()`. As soon as accumulated bytes + exceed the limit, a 413 is raised. Only ~one chunk past the limit ever enters memory, not the full payload. On + success the body is cached on `request._body` so downstream `request.body()` calls work without re-reading the + stream. + +The limit is configured via `MAX_REQUEST_SIZE_MB` in `config.toml` (default 10). Requests exceeding the limit receive a 413 response: ```json -{"detail": "Request too large. Maximum size is 10.0MB"} +{"detail": "Request too large. Maximum size is 10MB"} ``` -The middleware checks the `Content-Length` header before reading the body, avoiding wasted processing on oversized -requests. +!!! note "Why not middleware?" + A previous implementation used `RequestSizeLimitMiddleware` that only checked the `Content-Length` header. 
This was + trivially bypassable by omitting the header or lying about the value. A pure ASGI middleware that wraps `receive()` + is possible but requires closures for per-request state. The FastAPI dependency approach is simpler, testable, and + has access to `request.app.state.settings` for configuration. ## Rate Limit @@ -104,8 +114,8 @@ These expose: | File | Purpose | |----------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| [`core/middlewares/__init__.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/__init__.py) | Middleware exports | -| [`core/middlewares/rate_limit.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/rate_limit.py) | Rate limiting | -| [`core/middlewares/cache.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/cache.py) | Cache headers | -| [`core/middlewares/request_size_limit.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/request_size_limit.py) | Request size validation | -| [`core/middlewares/metrics.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/metrics.py) | HTTP and system metrics | +| [`core/middlewares/__init__.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/__init__.py) | Middleware exports | +| [`core/middlewares/rate_limit.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/rate_limit.py) | Rate limiting | +| [`core/middlewares/cache.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/cache.py) | Cache headers | +| [`core/middlewares/metrics.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/metrics.py) | HTTP and system metrics | +| [`api/dependencies.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/api/dependencies.py) | Request size enforcement | From ab40197622007087a8bb068346c141f36cdb6a63 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Fri, 13 Feb 2026 17:25:11 +0100 Subject: [PATCH 12/12] reverted back --- backend/app/api/dependencies.py | 33 +----------- backend/app/core/middlewares/__init__.py | 2 + .../core/middlewares/request_size_limit.py | 30 +++++++++++ backend/app/main.py | 6 +-- backend/app/settings.py | 1 - backend/config.toml | 1 - backend/tests/e2e/app/test_main_app.py | 8 ++- backend/tests/e2e/core/test_middlewares.py | 19 +------ docs/architecture/middleware.md | 54 ++++++++----------- 9 files changed, 67 insertions(+), 87 deletions(-) create mode 100644 backend/app/core/middlewares/request_size_limit.py diff --git a/backend/app/api/dependencies.py b/backend/app/api/dependencies.py index 826d33ee..6387ec97 100644 --- a/backend/app/api/dependencies.py +++ b/backend/app/api/dependencies.py @@ -1,42 +1,11 @@ from dishka import FromDishka from dishka.integrations.fastapi import inject -from fastapi import HTTPException, Request +from fastapi import Request from app.domain.user import User from app.services.auth_service import AuthService -async def check_request_size(request: Request) -> None: - """Reject requests whose body exceeds MAX_REQUEST_SIZE_MB from settings. - - Two-phase check: - 1. Content-Length header — rejects known-oversized requests without any I/O. - 2. 
Streaming read with cap — rejects as soon as accumulated bytes exceed the - limit, so a missing or dishonest Content-Length cannot force the full - payload into memory. - - After a successful check the body is cached on ``request._body`` so that - downstream calls to ``request.body()`` return the already-read bytes - (same attribute Starlette uses internally for caching). - """ - settings = request.app.state.settings - max_bytes: int = settings.MAX_REQUEST_SIZE_MB * 1024 * 1024 - detail = f"Request too large. Maximum size is {settings.MAX_REQUEST_SIZE_MB}MB" - - content_length = request.headers.get("content-length") - if content_length is not None and int(content_length) > max_bytes: - raise HTTPException(status_code=413, detail=detail) - - received = 0 - chunks: list[bytes] = [] - async for chunk in request.stream(): - received += len(chunk) - if received > max_bytes: - raise HTTPException(status_code=413, detail=detail) - chunks.append(chunk) - request._body = b"".join(chunks) - - @inject async def current_user(request: Request, auth_service: FromDishka[AuthService]) -> User: """Get authenticated user.""" diff --git a/backend/app/core/middlewares/__init__.py b/backend/app/core/middlewares/__init__.py index aa1a9c4f..0ea15f3a 100644 --- a/backend/app/core/middlewares/__init__.py +++ b/backend/app/core/middlewares/__init__.py @@ -2,6 +2,7 @@ from .csrf import CSRFMiddleware from .metrics import MetricsMiddleware, create_system_metrics, setup_metrics from .rate_limit import RateLimitMiddleware +from .request_size_limit import RequestSizeLimitMiddleware __all__ = [ "CacheControlMiddleware", @@ -9,5 +10,6 @@ "MetricsMiddleware", "setup_metrics", "create_system_metrics", + "RequestSizeLimitMiddleware", "RateLimitMiddleware", ] diff --git a/backend/app/core/middlewares/request_size_limit.py b/backend/app/core/middlewares/request_size_limit.py new file mode 100644 index 00000000..dcfdecd0 --- /dev/null +++ b/backend/app/core/middlewares/request_size_limit.py @@ -0,0 +1,30 @@ +from starlette.responses import JSONResponse +from starlette.types import ASGIApp, Receive, Scope, Send + + +class RequestSizeLimitMiddleware: + """Middleware to limit request size, default 10MB""" + + def __init__(self, app: ASGIApp, max_size_mb: int = 10) -> None: + self.app = app + self.max_size_bytes = max_size_mb * 1024 * 1024 + + async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: + if scope["type"] != "http": + await self.app(scope, receive, send) + return + + headers = dict(scope["headers"]) + content_length_header = headers.get(b"content-length") + + if content_length_header: + content_length = int(content_length_header) + if content_length > self.max_size_bytes: + response = JSONResponse( + status_code=413, + content={"detail": f"Request too large. 
Maximum size is {self.max_size_bytes / 1024 / 1024}MB"}, + ) + await response(scope, receive, send) + return + + await self.app(scope, receive, send) diff --git a/backend/app/main.py b/backend/app/main.py index ad60880d..2cdf36f5 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,9 +1,8 @@ import uvicorn from dishka.integrations.fastapi import setup_dishka as setup_dishka_fastapi -from fastapi import Depends, FastAPI +from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from app.api.dependencies import check_request_size from app.api.routes import ( auth, dlq, @@ -34,6 +33,7 @@ CSRFMiddleware, MetricsMiddleware, RateLimitMiddleware, + RequestSizeLimitMiddleware, setup_metrics, ) from app.settings import Settings @@ -60,7 +60,6 @@ def create_app(settings: Settings | None = None) -> FastAPI: openapi_url=None, docs_url=None, redoc_url=None, - dependencies=[Depends(check_request_size)], ) # Store settings on app state for lifespan access @@ -75,6 +74,7 @@ def create_app(settings: Settings | None = None) -> FastAPI: app.add_middleware(MetricsMiddleware) app.add_middleware(RateLimitMiddleware, settings=settings) app.add_middleware(CSRFMiddleware) + app.add_middleware(RequestSizeLimitMiddleware) app.add_middleware(CacheControlMiddleware) app.add_middleware( diff --git a/backend/app/settings.py b/backend/app/settings.py index 178df1ae..e3046241 100644 --- a/backend/app/settings.py +++ b/backend/app/settings.py @@ -58,7 +58,6 @@ def __init__( KUBERNETES_CA_CERTIFICATE_PATH: str | None = None RATE_LIMITS: str = "100/minute" RATE_LIMIT_ENABLED: bool = True # Set to False to disable rate limiting entirely - MAX_REQUEST_SIZE_MB: int = 10 SSL_KEYFILE: str = "/app/certs/server.key" SSL_CERTFILE: str = "/app/certs/server.crt" diff --git a/backend/config.toml b/backend/config.toml index 9049d8b1..369d4ea5 100644 --- a/backend/config.toml +++ b/backend/config.toml @@ -20,7 +20,6 @@ K8S_POD_EXECUTION_TIMEOUT = 5 K8S_NAMESPACE = "integr8scode" RATE_LIMITS = "100/minute" -MAX_REQUEST_SIZE_MB = 10 SERVER_HOST = "0.0.0.0" diff --git a/backend/tests/e2e/app/test_main_app.py b/backend/tests/e2e/app/test_main_app.py index d58a5107..b1e7a914 100644 --- a/backend/tests/e2e/app/test_main_app.py +++ b/backend/tests/e2e/app/test_main_app.py @@ -128,6 +128,11 @@ def test_cors_middleware_configured(self, app: FastAPI) -> None: middleware_classes = self._get_middleware_class_names(app) assert "CORSMiddleware" in middleware_classes + def test_request_size_limit_middleware_configured(self, app: FastAPI) -> None: + """Request size limit middleware is configured.""" + middleware_classes = self._get_middleware_class_names(app) + assert "RequestSizeLimitMiddleware" in middleware_classes + def test_cache_control_middleware_configured(self, app: FastAPI) -> None: """Cache control middleware is configured.""" middleware_classes = self._get_middleware_class_names(app) @@ -150,10 +155,11 @@ def test_csrf_middleware_configured(self, app: FastAPI) -> None: def test_middleware_count(self, app: FastAPI) -> None: """Expected number of middlewares are configured.""" - # CORS, CacheControl, Metrics, RateLimit, CSRF + # CORS, RequestSizeLimit, CacheControl, Metrics, RateLimit, CSRF middleware_classes = self._get_middleware_class_names(app) expected_middlewares = { "CORSMiddleware", + "RequestSizeLimitMiddleware", "CacheControlMiddleware", "MetricsMiddleware", "RateLimitMiddleware", diff --git a/backend/tests/e2e/core/test_middlewares.py b/backend/tests/e2e/core/test_middlewares.py index 
7c2af2c5..2564fe86 100644 --- a/backend/tests/e2e/core/test_middlewares.py +++ b/backend/tests/e2e/core/test_middlewares.py @@ -69,8 +69,8 @@ async def test_authenticated_post_with_csrf_succeeds( assert response.status_code != 403 -class TestRequestSizeLimit: - """Tests for check_request_size dependency.""" +class TestRequestSizeLimitMiddleware: + """Tests for RequestSizeLimitMiddleware.""" @pytest.mark.asyncio async def test_small_request_allowed( @@ -106,21 +106,6 @@ async def test_large_request_rejected( assert response.status_code == 413 assert "too large" in response.json()["detail"].lower() - @pytest.mark.asyncio - async def test_large_request_without_content_length_rejected( - self, client: httpx.AsyncClient - ) -> None: - """Requests without Content-Length header are still checked by body size.""" - large_payload = "x" * (11 * 1024 * 1024) # 11MB - - response = await client.post( - "/api/v1/auth/register", - content=large_payload, - headers={"Content-Type": "text/plain", "Transfer-Encoding": "chunked"}, - ) - - assert response.status_code == 413 - class TestCacheControlMiddleware: """Tests for CacheControlMiddleware.""" diff --git a/docs/architecture/middleware.md b/docs/architecture/middleware.md index 0cce2db9..e739ff27 100644 --- a/docs/architecture/middleware.md +++ b/docs/architecture/middleware.md @@ -1,45 +1,35 @@ -# Middleware & App-Level Guards +# Middleware -The backend uses ASGI middleware for cross-cutting concerns like rate limiting, caching, and metrics collection, plus a -FastAPI app-level dependency for request size enforcement. Middleware runs in order from outermost to innermost, with -response processing in reverse order. +The backend uses a stack of ASGI middleware to handle cross-cutting concerns like rate limiting, request size +validation, caching, and metrics collection. Middleware runs in order from outermost to innermost, with response +processing in reverse order. ## Middleware Stack The middleware is applied in this order (outermost first): -1. **RateLimitMiddleware** - Enforces per-user/per-endpoint limits -2. **CacheControlMiddleware** - Adds cache headers to responses -3. **MetricsMiddleware** - Collects HTTP request metrics +1. **RequestSizeLimitMiddleware** - Rejects oversized requests +2. **RateLimitMiddleware** - Enforces per-user/per-endpoint limits +3. **CacheControlMiddleware** - Adds cache headers to responses +4. **MetricsMiddleware** - Collects HTTP request metrics -## Request Size Limit (App Dependency) +## Request Size Limit -Request size enforcement is implemented as a FastAPI app-level dependency (`check_request_size` in -`app/api/dependencies.py`) rather than ASGI middleware. It is registered in `FastAPI(dependencies=[...])` so it runs on -every request. +Rejects requests exceeding a configurable size limit (default 10MB). This protects against denial-of-service attacks +from large payloads. -The dependency uses a two-phase approach to reject oversized requests without buffering the entire payload into memory: - -1. **Content-Length fast-path** — if the header is present and exceeds `MAX_REQUEST_SIZE_MB`, the request is rejected - immediately with zero body I/O. -2. **Streaming read with cap** — the body is read chunk-by-chunk via `request.stream()`. As soon as accumulated bytes - exceed the limit, a 413 is raised. Only ~one chunk past the limit ever enters memory, not the full payload. On - success the body is cached on `request._body` so downstream `request.body()` calls work without re-reading the - stream. 
- -The limit is configured via `MAX_REQUEST_SIZE_MB` in `config.toml` (default 10). +```python +--8<-- "backend/app/core/middlewares/request_size_limit.py:5:10" +``` Requests exceeding the limit receive a 413 response: ```json -{"detail": "Request too large. Maximum size is 10MB"} +{"detail": "Request too large. Maximum size is 10.0MB"} ``` -!!! note "Why not middleware?" - A previous implementation used `RequestSizeLimitMiddleware` that only checked the `Content-Length` header. This was - trivially bypassable by omitting the header or lying about the value. A pure ASGI middleware that wraps `receive()` - is possible but requires closures for per-request state. The FastAPI dependency approach is simpler, testable, and - has access to `request.app.state.settings` for configuration. +The middleware checks the `Content-Length` header before reading the body, avoiding wasted processing on oversized +requests. ## Rate Limit @@ -114,8 +104,8 @@ These expose: | File | Purpose | |----------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| -| [`core/middlewares/__init__.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/__init__.py) | Middleware exports | -| [`core/middlewares/rate_limit.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/rate_limit.py) | Rate limiting | -| [`core/middlewares/cache.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/cache.py) | Cache headers | -| [`core/middlewares/metrics.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/metrics.py) | HTTP and system metrics | -| [`api/dependencies.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/api/dependencies.py) | Request size enforcement | +| [`core/middlewares/__init__.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/__init__.py) | Middleware exports | +| [`core/middlewares/rate_limit.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/rate_limit.py) | Rate limiting | +| [`core/middlewares/cache.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/cache.py) | Cache headers | +| [`core/middlewares/request_size_limit.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/request_size_limit.py) | Request size validation | +| [`core/middlewares/metrics.py`](https://github.com/HardMax71/Integr8sCode/blob/main/backend/app/core/middlewares/metrics.py) | HTTP and system metrics |