diff --git a/lambdas/file-scanner-lambda/src/apis/sqs-handler.ts b/lambdas/file-scanner-lambda/src/apis/sqs-handler.ts index d8b7b706..0acf2934 100644 --- a/lambdas/file-scanner-lambda/src/apis/sqs-handler.ts +++ b/lambdas/file-scanner-lambda/src/apis/sqs-handler.ts @@ -64,6 +64,7 @@ async function processRecord( messageReference: event.data.messageReference, senderId: event.data.senderId, createdAt: event.time, + traceparent: event.traceparent, }); if (result.outcome === 'failed') { diff --git a/lambdas/file-scanner-lambda/src/app/file-scanner.ts b/lambdas/file-scanner-lambda/src/app/file-scanner.ts index 3eabde7a..fb687b0a 100644 --- a/lambdas/file-scanner-lambda/src/app/file-scanner.ts +++ b/lambdas/file-scanner-lambda/src/app/file-scanner.ts @@ -24,6 +24,7 @@ export interface ScanFileMetadata { messageReference: string; senderId: string; createdAt: string; + traceparent: string; } export type ScanFileResult = { @@ -158,6 +159,7 @@ export class FileScanner { messageReference: metadata.messageReference, senderId: metadata.senderId, createdAt: metadata.createdAt, + traceparent: metadata.traceparent, }, }; diff --git a/lambdas/mesh-acknowledge/requirements.txt b/lambdas/mesh-acknowledge/requirements.txt index 68ecfdcb..3e375cb8 100644 --- a/lambdas/mesh-acknowledge/requirements.txt +++ b/lambdas/mesh-acknowledge/requirements.txt @@ -3,6 +3,8 @@ boto3>=1.28.62 pyopenssl>=24.2.1 pydantic>=2.0.0 structlog>=21.5.0 +opentelemetry-api>=1.25.0 +opentelemetry-sdk>=1.25.0 -e ../../src/digital-letters-events -e ../../utils/py-mock-mesh -e ../../utils/py-utils diff --git a/lambdas/mesh-download/mesh_download/processor.py b/lambdas/mesh-download/mesh_download/processor.py index e51edced..45264113 100644 --- a/lambdas/mesh-download/mesh_download/processor.py +++ b/lambdas/mesh-download/mesh_download/processor.py @@ -23,7 +23,10 @@ def __init__(self, **kwargs): def process_sqs_message(self, sqs_record): try: validated_event = self._parse_and_validate_event(sqs_record) - logger = self.__log.bind(mesh_message_id=validated_event.data.meshMessageId) + logger = self.__log.bind( + mesh_message_id=validated_event.data.meshMessageId, + traceparent=validated_event.traceparent, + ) logger.info("Processing MESH download request") self._handle_download(validated_event, logger) @@ -106,6 +109,8 @@ def _store_message_content(self, sender_id, message_reference, message_content, def _publish_downloaded_event(self, incoming_event, message_uri): """ Publishes a MESHInboxMessageDownloaded event. + The EventPublisher will derive a child traceparent from the incoming traceparent + automatically, preserving the trace-id across this service hop. """ now = datetime.now(timezone.utc).isoformat() @@ -137,5 +142,7 @@ def _publish_downloaded_event(self, incoming_event, message_uri): "Published MESHInboxMessageDownloaded event", sender_id=incoming_event.data.senderId, message_uri=message_uri, - message_reference=incoming_event.data.messageReference + message_reference=incoming_event.data.messageReference, + incoming_traceparent=incoming_event.traceparent, + outgoing_traceparent=cloud_event['traceparent'], ) diff --git a/lambdas/mesh-download/requirements.txt b/lambdas/mesh-download/requirements.txt index b9af3fe0..41615568 100644 --- a/lambdas/mesh-download/requirements.txt +++ b/lambdas/mesh-download/requirements.txt @@ -8,6 +8,8 @@ urllib3>=1.26.19,<2.0.0 idna>=3.7 requests>=2.32.0 pyopenssl>=24.2.1 +opentelemetry-api>=1.25.0 +opentelemetry-sdk>=1.25.0 -e ../../src/digital-letters-events -e ../../utils/py-mock-mesh -e ../../utils/py-utils diff --git a/lambdas/mesh-poll/mesh_poll/processor.py b/lambdas/mesh-poll/mesh_poll/processor.py index 79ff6527..f8fe2c07 100644 --- a/lambdas/mesh-poll/mesh_poll/processor.py +++ b/lambdas/mesh-poll/mesh_poll/processor.py @@ -151,6 +151,7 @@ def process_message(self, message): def _publish_mesh_inbox_message_received_event(self, event_detail): """ Publishes a MESHInboxMessageReceived event for the retriever component. + The EventPublisher will create a root traceparent automatically. """ now = datetime.now(timezone.utc).isoformat() @@ -167,7 +168,6 @@ def _publish_mesh_inbox_message_received_event(self, event_detail): 'recordedtime': now, 'severitynumber': 2, 'severitytext': 'INFO', - 'traceparent': '00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01', 'dataschema': ( 'https://notify.nhs.uk/cloudevents/schemas/digital-letters/' '2025-10-draft/data/digital-letters-mesh-inbox-message-received-data.schema.json' @@ -185,11 +185,13 @@ def _publish_mesh_inbox_message_received_event(self, event_detail): self.__log.info("Published MESHInboxMessageReceived event", mesh_message_id=event_detail["data"]["meshMessageId"], - sender_id=event_detail["data"]["senderId"]) + sender_id=event_detail["data"]["senderId"], + traceparent=cloud_event['traceparent']) def _publish_mesh_inbox_message_invalid_event(self, event_detail): """ Publishes a MESHInboxMessageInvalid event when a message fails validation. + The EventPublisher will create a root traceparent automatically. """ now = datetime.now(timezone.utc).isoformat() @@ -203,7 +205,6 @@ def _publish_mesh_inbox_message_invalid_event(self, event_detail): 'recordedtime': now, 'severitynumber': 3, 'severitytext': 'WARN', - 'traceparent': '00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01', 'dataschema': ( 'https://notify.nhs.uk/cloudevents/schemas/digital-letters/' '2025-10-draft/data/digital-letters-mesh-inbox-message-invalid-data.schema.json' diff --git a/lambdas/mesh-poll/requirements.txt b/lambdas/mesh-poll/requirements.txt index 5e7f1345..36ed181b 100644 --- a/lambdas/mesh-poll/requirements.txt +++ b/lambdas/mesh-poll/requirements.txt @@ -8,6 +8,8 @@ idna>=3.7 requests>=2.32.0 pyopenssl>=24.2.1 pydantic>=2.0.0 +opentelemetry-api>=1.25.0 +opentelemetry-sdk>=1.25.0 -e ../../src/digital-letters-events -e ../../utils/py-mock-mesh -e ../../utils/py-utils diff --git a/lambdas/move-scanned-files-lambda/src/app/move-file-handler.ts b/lambdas/move-scanned-files-lambda/src/app/move-file-handler.ts index f8d0bb48..ac5a65c4 100644 --- a/lambdas/move-scanned-files-lambda/src/app/move-file-handler.ts +++ b/lambdas/move-scanned-files-lambda/src/app/move-file-handler.ts @@ -19,6 +19,7 @@ type ObjectMetadata = { senderId: string; messageReference: string; createdAt: string; + traceparent?: string; }; type ObjectsFromEvent = { @@ -34,6 +35,7 @@ const SCAN_RESULT_STATUS_NO_THREATS_FOUND = 'NO_THREATS_FOUND'; const METADATA_CREATED_AT = 'createdat'; const METADATA_MESSAGE_REFERENCE = 'messagereference'; const METADATA_SENDER_ID = 'senderid'; +const METADATA_TRACEPARENT = 'traceparent'; /** * Utility function to extract a subset of the object key for logging purposes. @@ -121,6 +123,7 @@ export class MoveFileHandler { const createdAt = metadataMap.get(METADATA_CREATED_AT); const messageReference = metadataMap.get(METADATA_MESSAGE_REFERENCE); const senderId = metadataMap.get(METADATA_SENDER_ID); + const traceparent = metadataMap.get(METADATA_TRACEPARENT); if (!messageReference || !senderId || !createdAt) { return null; @@ -137,6 +140,7 @@ export class MoveFileHandler { senderId, messageReference, createdAt, + traceparent, }; } @@ -160,6 +164,7 @@ export class MoveFileHandler { metadata.senderId, `s3://${destinationBucket}/${objectKey}`, metadata.createdAt, + metadata.traceparent, ); } else { destinationBucket = this.quarantineBucketName; @@ -176,6 +181,7 @@ export class MoveFileHandler { metadata.senderId, `s3://${destinationBucket}/${objectKey}`, metadata.createdAt, + metadata.traceparent, ); } diff --git a/lambdas/move-scanned-files-lambda/src/domain/mapper.ts b/lambdas/move-scanned-files-lambda/src/domain/mapper.ts index fe0d483a..8c90528a 100644 --- a/lambdas/move-scanned-files-lambda/src/domain/mapper.ts +++ b/lambdas/move-scanned-files-lambda/src/domain/mapper.ts @@ -7,6 +7,7 @@ function createEventWithCommonFields( senderId: string, letterUri: string, createdAt: string, + traceparent?: string, ): FileSafe | FileQuarantined { return { specversion: '1.0', @@ -14,7 +15,7 @@ function createEventWithCommonFields( subject: `customer/${senderId}/recipient/${messageReference}`, source: '/nhs/england/notify/production/primary/data-plane/digitalletters/print', // Note CCM-13892. - traceparent: '00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01', // Note CCM-14255. + traceparent: traceparent ?? '', // EventPublisher will derive child or create root type: isFileSafe ? 'uk.nhs.notify.digital.letters.print.file.safe.v1' : 'uk.nhs.notify.digital.letters.print.file.quarantined.v1', @@ -36,6 +37,7 @@ export function createFileSafeEvent( senderId: string, letterUri: string, createdAt: string, + traceparent?: string, ): FileSafe { return createEventWithCommonFields( true, @@ -43,6 +45,7 @@ export function createFileSafeEvent( senderId, letterUri, createdAt, + traceparent, ) as FileSafe; } @@ -51,6 +54,7 @@ export function createFileQuarantinedEvent( senderId: string, letterUri: string, createdAt: string, + traceparent?: string, ): FileQuarantined { return createEventWithCommonFields( false, @@ -58,5 +62,6 @@ export function createFileQuarantinedEvent( senderId, letterUri, createdAt, + traceparent, ) as FileQuarantined; } diff --git a/lambdas/pdm-poll-lambda/src/__tests__/apis/sqs-handler.test.ts b/lambdas/pdm-poll-lambda/src/__tests__/apis/sqs-handler.test.ts index 74c1ac35..edc6f3f0 100644 --- a/lambdas/pdm-poll-lambda/src/__tests__/apis/sqs-handler.test.ts +++ b/lambdas/pdm-poll-lambda/src/__tests__/apis/sqs-handler.test.ts @@ -15,13 +15,24 @@ const pdm = mock(); jest.mock('node:crypto', () => ({ randomUUID: jest.fn(), + randomBytes: jest.fn(), })); +// eslint-disable-next-line @typescript-eslint/no-require-imports +const { randomBytes } = require('node:crypto'); + const mockRandomUUID = randomUUID as jest.MockedFunction; +const mockRandomBytes = randomBytes as jest.MockedFunction< + typeof import('node:crypto').randomBytes +>; const mockDate = jest.spyOn(Date.prototype, 'toISOString'); mockRandomUUID.mockReturnValue('550e8400-e29b-41d4-a716-446655440001'); +mockRandomBytes.mockImplementation(() => Buffer.from('aabbccdd11223344', 'hex')); mockDate.mockReturnValue('2023-06-20T12:00:00.250Z'); +const DERIVED_TRACEPARENT = + '00-0af7651916cd43dd8448eb211c80319c-aabbccdd11223344-01'; + const handler = createHandler({ eventPublisher, logger, @@ -51,6 +62,7 @@ describe('SQS Handler', () => { id: '550e8400-e29b-41d4-a716-446655440001', time: '2023-06-20T12:00:00.250Z', recordedtime: '2023-06-20T12:00:00.250Z', + traceparent: DERIVED_TRACEPARENT, dataschema: 'https://notify.nhs.uk/cloudevents/schemas/digital-letters/2025-10-draft/data/digital-letters-pdm-resource-available-data.schema.json', type: 'uk.nhs.notify.digital.letters.pdm.resource.available.v1', @@ -90,6 +102,7 @@ describe('SQS Handler', () => { id: '550e8400-e29b-41d4-a716-446655440001', time: '2023-06-20T12:00:00.250Z', recordedtime: '2023-06-20T12:00:00.250Z', + traceparent: DERIVED_TRACEPARENT, dataschema: 'https://notify.nhs.uk/cloudevents/schemas/digital-letters/2025-10-draft/data/digital-letters-pdm-resource-unavailable-data.schema.json', type: 'uk.nhs.notify.digital.letters.pdm.resource.unavailable.v1', @@ -132,6 +145,7 @@ describe('SQS Handler', () => { id: '550e8400-e29b-41d4-a716-446655440001', time: '2023-06-20T12:00:00.250Z', recordedtime: '2023-06-20T12:00:00.250Z', + traceparent: DERIVED_TRACEPARENT, dataschema: 'https://notify.nhs.uk/cloudevents/schemas/digital-letters/2025-10-draft/data/digital-letters-pdm-resource-available-data.schema.json', type: 'uk.nhs.notify.digital.letters.pdm.resource.available.v1', @@ -174,6 +188,7 @@ describe('SQS Handler', () => { id: '550e8400-e29b-41d4-a716-446655440001', time: '2023-06-20T12:00:00.250Z', recordedtime: '2023-06-20T12:00:00.250Z', + traceparent: DERIVED_TRACEPARENT, type: 'uk.nhs.notify.digital.letters.pdm.resource.unavailable.v1', data: { messageReference: @@ -219,6 +234,7 @@ describe('SQS Handler', () => { id: '550e8400-e29b-41d4-a716-446655440001', time: '2023-06-20T12:00:00.250Z', recordedtime: '2023-06-20T12:00:00.250Z', + traceparent: DERIVED_TRACEPARENT, dataschema: 'https://notify.nhs.uk/cloudevents/schemas/digital-letters/2025-10-draft/data/digital-letters-pdm-resource-retries-exceeded-data.schema.json', type: 'uk.nhs.notify.digital.letters.pdm.resource.retries.exceeded.v1', diff --git a/lambdas/pdm-poll-lambda/src/apis/sqs-handler.ts b/lambdas/pdm-poll-lambda/src/apis/sqs-handler.ts index 39ae9c72..1b7c3de2 100644 --- a/lambdas/pdm-poll-lambda/src/apis/sqs-handler.ts +++ b/lambdas/pdm-poll-lambda/src/apis/sqs-handler.ts @@ -184,16 +184,30 @@ export const createHandler = ({ if (pdmAvailability === 'unavailable') { if (retries >= pollMaxRetries) { - retriesExceededEvents.push( - generateRetriesExceededEvent(event, retries), - ); + const outgoing = generateRetriesExceededEvent(event, retries); + logger.info({ + description: 'TraceContext hop', + incoming_traceparent: event.traceparent, + outgoing_traceparent: outgoing.traceparent, + }); + retriesExceededEvents.push(outgoing); } else { - unavailableEvents.push(generateUnavailableEvent(event, retries)); + const outgoing = generateUnavailableEvent(event, retries); + logger.info({ + description: 'TraceContext hop', + incoming_traceparent: event.traceparent, + outgoing_traceparent: outgoing.traceparent, + }); + unavailableEvents.push(outgoing); } } else { - availableEvents.push( - generateAvailableEvent(event, nhsNumber, odsCode), - ); + const outgoing = generateAvailableEvent(event, nhsNumber, odsCode); + logger.info({ + description: 'TraceContext hop', + incoming_traceparent: event.traceparent, + outgoing_traceparent: outgoing.traceparent, + }); + availableEvents.push(outgoing); } } catch (error: any) { logger.warn({ diff --git a/lambdas/report-sender/requirements.txt b/lambdas/report-sender/requirements.txt index 5e7f1345..36ed181b 100644 --- a/lambdas/report-sender/requirements.txt +++ b/lambdas/report-sender/requirements.txt @@ -8,6 +8,8 @@ idna>=3.7 requests>=2.32.0 pyopenssl>=24.2.1 pydantic>=2.0.0 +opentelemetry-api>=1.25.0 +opentelemetry-sdk>=1.25.0 -e ../../src/digital-letters-events -e ../../utils/py-mock-mesh -e ../../utils/py-utils diff --git a/package-lock.json b/package-lock.json index 177d60c7..83738854 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9424,6 +9424,102 @@ "dev": true, "license": "MIT" }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "license": "Apache-2.0", + "peer": true, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/context-async-hooks": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/context-async-hooks/-/context-async-hooks-2.6.0.tgz", + "integrity": "sha512-L8UyDwqpTcbkIK5cgwDRDYDoEhQoj8wp8BwsO19w3LB1Z41yEQm2VJyNfAi9DrLP/YTqXqWpKHyZfR9/tFYo1Q==", + "license": "Apache-2.0", + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/core": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.6.0.tgz", + "integrity": "sha512-HLM1v2cbZ4TgYN6KEOj+Bbj8rAKriOdkF9Ed3tG25FoprSiQl7kYc+RRT6fUZGOvx0oMi5U67GoFdT+XUn8zEg==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/resources": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.6.0.tgz", + "integrity": "sha512-D4y/+OGe3JSuYUCBxtH5T9DSAWNcvCb/nQWIga8HNtXTVPQn59j0nTBAgaAXxUVBDl40mG3Tc76b46wPlZaiJQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.6.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-base": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.6.0.tgz", + "integrity": "sha512-g/OZVkqlxllgFM7qMKqbPV9c1DUPhQ7d4n3pgZFcrnrNft9eJXZM2TNHTPYREJBrtNdRytYyvwjgL5geDKl3EQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.6.0", + "@opentelemetry/resources": "2.6.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-node": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-node/-/sdk-trace-node-2.6.0.tgz", + "integrity": "sha512-YhswtasmsbIGEFvLGvR9p/y3PVRTfFf+mgY8van4Ygpnv4sA3vooAjvh+qAn9PNWxs4/IwGGqiQS0PPsaRJ0vQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/context-async-hooks": "2.6.0", + "@opentelemetry/core": "2.6.0", + "@opentelemetry/sdk-trace-base": "2.6.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/semantic-conventions": { + "version": "1.40.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.40.0.tgz", + "integrity": "sha512-cifvXDhcqMwwTlTK04GBNeIe7yyo28Mfby85QXFe1Yk8nmi36Ab/5UQwptOx84SsoGNRg+EVSjwzfSZMy6pmlw==", + "license": "Apache-2.0", + "engines": { + "node": ">=14" + } + }, "node_modules/@pdf-lib/standard-fonts": { "version": "1.0.0", "license": "MIT", @@ -22977,6 +23073,8 @@ "@aws-sdk/client-ssm": "^3.984.0", "@aws-sdk/lib-dynamodb": "^3.984.0", "@aws-sdk/lib-storage": "^3.984.0", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/sdk-trace-node": "^2.6.0", "async-mutex": "^0.4.0", "axios": "^1.13.5", "date-fns": "^4.1.0", diff --git a/utils/py-utils/dl_utils/__init__.py b/utils/py-utils/dl_utils/__init__.py index 1c8046ba..5d8c28da 100644 --- a/utils/py-utils/dl_utils/__init__.py +++ b/utils/py-utils/dl_utils/__init__.py @@ -22,6 +22,11 @@ report_expiry_time ) +from .trace_context import ( + create_traceparent, + derive_child_traceparent, +) + __all__ = [ 'EventPublisher', 'BaseMeshConfig', @@ -33,4 +38,6 @@ 'Metric', 'CertificateExpiryMonitor', 'report_expiry_time', + 'create_traceparent', + 'derive_child_traceparent', ] diff --git a/utils/py-utils/dl_utils/event_publisher.py b/utils/py-utils/dl_utils/event_publisher.py index b533ccba..c9d95b0c 100644 --- a/utils/py-utils/dl_utils/event_publisher.py +++ b/utils/py-utils/dl_utils/event_publisher.py @@ -11,6 +11,7 @@ import boto3 from botocore.exceptions import ClientError from pydantic import ValidationError +from .trace_context import create_traceparent, derive_child_traceparent DlqReason = Literal['INVALID_EVENT', 'EVENTBRIDGE_FAILURE'] @@ -220,14 +221,22 @@ def send_events(self, events: List[Dict[str, Any]], """ Send CloudEvents to EventBridge with validation and DLQ support. - 1. Validates events using the specified validator function - 2. Sends valid events to EventBridge - 3. Routes failed events to DLQ + 1. Stamps each event with a traceparent: derives a child if one exists, otherwise creates a new root traceparent. + 2. Validates events using the specified validator function + 3. Sends valid events to EventBridge + 4. Routes failed events to DLQ """ if not events: self.logger.info('No events to send') return [] + for event in events: + incoming = event.get('traceparent') + if incoming: + event['traceparent'] = derive_child_traceparent(incoming) + else: + event['traceparent'] = create_traceparent() + valid_events = [] invalid_events = [] diff --git a/utils/py-utils/dl_utils/trace_context.py b/utils/py-utils/dl_utils/trace_context.py new file mode 100644 index 00000000..6d92b880 --- /dev/null +++ b/utils/py-utils/dl_utils/trace_context.py @@ -0,0 +1,38 @@ +"""W3C TraceContext helpers for Digital Letters using OpenTelemetry. + +Uses the OpenTelemetry API for context propagation. This means: +- Trace IDs and span IDs are generated by the OTel SDK +- traceparent strings are standard W3C format: 00--- +- When an ADOT exporter is later configured, these traces will automatically flow into X-Ray / CloudWatch Application Signals with no code changes needed +""" + +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + +trace.set_tracer_provider(TracerProvider()) +_propagator = TraceContextTextMapPropagator() +_tracer = trace.get_tracer(__name__) + + +def create_traceparent() -> str: + """Create a new root W3C traceparent via the OTel tracer.""" + span = _tracer.start_span("root") + ctx = span.get_span_context() + span.end() + trace_id = format(ctx.trace_id, '032x') + span_id = format(ctx.span_id, '016x') + return f'00-{trace_id}-{span_id}-01' + + +def derive_child_traceparent(incoming: str) -> str: + """Return a child traceparent that shares the incoming trace-id, via OTel context.""" + carrier = {'traceparent': incoming} + parent_ctx = _propagator.extract(carrier=carrier) + span = _tracer.start_span("child", context=parent_ctx) + ctx = span.get_span_context() + span.end() + trace_id = format(ctx.trace_id, '032x') + span_id = format(ctx.span_id, '016x') + flags = format(ctx.trace_flags, '02x') + return f'00-{trace_id}-{span_id}-{flags}' diff --git a/utils/py-utils/requirements.txt b/utils/py-utils/requirements.txt index 3ae7ecbd..4824dadb 100644 --- a/utils/py-utils/requirements.txt +++ b/utils/py-utils/requirements.txt @@ -3,4 +3,6 @@ pydantic>=2.0.0 structlog>=21.5.0 mesh-client>=3.2.3 pyopenssl>=24.0.0 +opentelemetry-api>=1.25.0 +opentelemetry-sdk>=1.25.0 -e ../py-mock-mesh diff --git a/utils/utils/package.json b/utils/utils/package.json index 780d0109..723dd324 100644 --- a/utils/utils/package.json +++ b/utils/utils/package.json @@ -9,6 +9,8 @@ "@aws-sdk/client-ssm": "^3.984.0", "@aws-sdk/lib-dynamodb": "^3.984.0", "@aws-sdk/lib-storage": "^3.984.0", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/sdk-trace-node": "^2.6.0", "async-mutex": "^0.4.0", "axios": "^1.13.5", "date-fns": "^4.1.0", @@ -31,7 +33,8 @@ }, "exports": { ".": "./src/index.ts", - "./logger": "./src/logger.ts" + "./logger": "./src/logger.ts", + "./trace-context": "./src/trace-context/index.ts" }, "main": "src/index.ts", "name": "utils", diff --git a/utils/utils/src/event-publisher/event-publisher.ts b/utils/utils/src/event-publisher/event-publisher.ts index e7b6b45f..df853099 100644 --- a/utils/utils/src/event-publisher/event-publisher.ts +++ b/utils/utils/src/event-publisher/event-publisher.ts @@ -5,6 +5,10 @@ import { import { SQSClient, SendMessageBatchCommand } from '@aws-sdk/client-sqs'; import { randomUUID } from 'node:crypto'; import { Logger } from '../logger'; +import { + createTraceparent, + deriveChildTraceparent, +} from '../trace-context/trace-context'; type DlqReason = 'INVALID_EVENT' | 'EVENTBRIDGE_FAILURE'; @@ -197,6 +201,16 @@ export class EventPublisher { return []; } + // Stamp each event with a traceparent: derive child if one exists, else create root + for (const event of events) { + const incoming = (event as any).traceparent as string | undefined; + if (incoming) { + (event as any).traceparent = deriveChildTraceparent(incoming); + } else { + (event as any).traceparent = createTraceparent(); + } + } + const validEvents: T[] = []; const invalidEvents: T[] = []; diff --git a/utils/utils/src/index.ts b/utils/utils/src/index.ts index 4b9333bc..b93ced2c 100644 --- a/utils/utils/src/index.ts +++ b/utils/utils/src/index.ts @@ -16,3 +16,4 @@ export * from './key-generation-utils'; export * from './schema-utils'; export * from './pdm-client'; export * from './reporting'; +export * from './trace-context'; diff --git a/utils/utils/src/trace-context/index.ts b/utils/utils/src/trace-context/index.ts new file mode 100644 index 00000000..28ce4237 --- /dev/null +++ b/utils/utils/src/trace-context/index.ts @@ -0,0 +1 @@ +export * from './trace-context'; diff --git a/utils/utils/src/trace-context/trace-context.ts b/utils/utils/src/trace-context/trace-context.ts new file mode 100644 index 00000000..dd1fb385 --- /dev/null +++ b/utils/utils/src/trace-context/trace-context.ts @@ -0,0 +1,44 @@ +/** + * W3C TraceContext helpers for Digital Letters using OpenTelemetry. + * + * Uses the OpenTelemetry API for context propagation. This means: + * - Trace IDs and span IDs are generated by the OTel SDK + * - traceparent strings are standard W3C format: 00--- + * - When an ADOT exporter is later configured, these traces will automatically + * flow into X-Ray / CloudWatch Application Signals with no code changes needed here + */ + +import { + context, + propagation, + trace, + TraceFlags, + ROOT_CONTEXT, +} from '@opentelemetry/api'; +import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; + +const provider = new NodeTracerProvider(); +provider.register(); + +const tracer = trace.getTracer('dl-trace-context'); + +/** Create a new root W3C traceparent via the OTel tracer */ +export function createTraceparent(): string { + const span = tracer.startSpan('root', undefined, ROOT_CONTEXT); + const ctx = span.spanContext(); + span.end(); + const traceId = ctx.traceId; + const spanId = ctx.spanId; + return `00-${traceId}-${spanId}-01`; // Formats into W3C traceparent string +} + +/** Return a child traceparent that shares the incoming trace-id, via OTel context */ +export function deriveChildTraceparent(incoming: string): string { + const carrier: Record = { traceparent: incoming }; + const parentCtx = propagation.extract(ROOT_CONTEXT, carrier); + const span = tracer.startSpan('child', undefined, parentCtx); + const ctx = span.spanContext(); + span.end(); + const flags = (ctx.traceFlags & TraceFlags.SAMPLED) ? '01' : '00'; + return `00-${ctx.traceId}-${ctx.spanId}-${flags}`; +}