From 0314faaf0d5a8db741484a0f6bc48ea7b3e2c67b Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 13 Feb 2026 15:12:03 -0500 Subject: [PATCH 01/35] Set up Cgroup, CpuStats, and CpuMetricsCollector structs, and cgroup file reading and cpu stats calculation functions --- Cargo.lock | 18 ++ crates/datadog-serverless-compat/src/main.rs | 27 +- crates/datadog-trace-agent/Cargo.toml | 2 + crates/datadog-trace-agent/src/lib.rs | 1 + .../src/metrics_collector.rs | 245 ++++++++++++++++++ 5 files changed, 291 insertions(+), 2 deletions(-) create mode 100644 crates/datadog-trace-agent/src/metrics_collector.rs diff --git a/Cargo.lock b/Cargo.lock index 19805168..9c6478f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -428,6 +428,7 @@ dependencies = [ "async-trait", "bytes", "datadog-fips", + "dogstatsd", "duplicate", "http-body-util", "hyper", @@ -437,6 +438,7 @@ dependencies = [ "libdd-trace-obfuscation", "libdd-trace-protobuf", "libdd-trace-utils", + "num_cpus", "reqwest", "rmp-serde", "serde", @@ -855,6 +857,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -1494,6 +1502,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "once_cell" version = "1.21.3" diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 6d764815..32f573cc 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -18,7 +18,8 @@ use zstd::zstd_safe::CompressionLevel; use datadog_trace_agent::{ aggregator::TraceAggregator, - config, env_verifier, mini_agent, proxy_flusher, stats_flusher, stats_processor, + config, env_verifier, metrics_collector, mini_agent, proxy_flusher, stats_flusher, + stats_processor, trace_flusher::{self, TraceFlusher}, trace_processor, }; @@ -39,6 +40,7 @@ use dogstatsd::{ use dogstatsd::metric::{SortedTags, EMPTY_TAGS}; use tokio_util::sync::CancellationToken; +const CPU_METRICS_COLLECTION_INTERVAL: u64 = 3; const DOGSTATSD_FLUSH_INTERVAL: u64 = 10; const DOGSTATSD_TIMEOUT_DURATION: Duration = Duration::from_secs(5); const DEFAULT_DOGSTATSD_PORT: u16 = 8125; @@ -104,6 +106,10 @@ pub async fn main() { .ok() .and_then(|val| parse_metric_namespace(&val)); + let dd_enhanced_metrics = env::var("DD_ENHANCED_METRICS") + .map(|val| val.to_lowercase() != "false") + .unwrap_or(true); + let https_proxy = env::var("DD_PROXY_HTTPS") .or_else(|_| env::var("HTTPS_PROXY")) .ok(); @@ -170,7 +176,7 @@ pub async fn main() { } }); - let (metrics_flusher, _aggregator_handle) = if dd_use_dogstatsd { + let (metrics_flusher, aggregator_handle) = if dd_use_dogstatsd { debug!("Starting dogstatsd"); let (_, metrics_flusher, aggregator_handle) = start_dogstatsd( dd_dogstatsd_port, @@ -194,6 +200,23 @@ pub async fn main() { (None, None) }; + // If DD_ENHANCED_METRICS is true, start the CPU metrics collector + // Use the existing aggregator handle + // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. + if dd_enhanced_metrics { + if let Some(ref handle) = aggregator_handle { + let cpu_collector_handle = handle.clone(); + tokio::spawn(async move { + let mut cpu_collector = metrics_collector::CpuMetricsCollector::new( + cpu_collector_handle, + None, + -1, + CPU_METRICS_COLLECTION_INTERVAL, + ); + }); + } + } + let mut flush_interval = interval(Duration::from_secs(DOGSTATSD_FLUSH_INTERVAL)); flush_interval.tick().await; // discard first tick, which is instantaneous diff --git a/crates/datadog-trace-agent/Cargo.toml b/crates/datadog-trace-agent/Cargo.toml index aec60c93..a7e14c5a 100644 --- a/crates/datadog-trace-agent/Cargo.toml +++ b/crates/datadog-trace-agent/Cargo.toml @@ -33,6 +33,8 @@ libdd-trace-obfuscation = { git = "https://github.com/DataDog/libdatadog", rev = datadog-fips = { path = "../datadog-fips" } reqwest = { version = "0.12.23", features = ["json", "http2"], default-features = false } bytes = "1.10.1" +dogstatsd = { path = "../dogstatsd", default-features = true } +num_cpus = "1.16" [dev-dependencies] rmp-serde = "1.1.1" diff --git a/crates/datadog-trace-agent/src/lib.rs b/crates/datadog-trace-agent/src/lib.rs index a87bf56b..5ff530ae 100644 --- a/crates/datadog-trace-agent/src/lib.rs +++ b/crates/datadog-trace-agent/src/lib.rs @@ -11,6 +11,7 @@ pub mod aggregator; pub mod config; pub mod env_verifier; pub mod http_utils; +pub mod metrics_collector; pub mod mini_agent; pub mod proxy_flusher; pub mod stats_flusher; diff --git a/crates/datadog-trace-agent/src/metrics_collector.rs b/crates/datadog-trace-agent/src/metrics_collector.rs new file mode 100644 index 00000000..9474da86 --- /dev/null +++ b/crates/datadog-trace-agent/src/metrics_collector.rs @@ -0,0 +1,245 @@ +// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! CPU metrics collector for Azure Functions +//! +//! This module provides functionality to read raw CPU statistics from cgroup v1 files, +//! compute the CPU usage and limit, and submit them as distribution metrics to Datadog. +//! +//! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). + +use dogstatsd::aggregator_service::AggregatorHandle; +use dogstatsd::metric::SortedTags; +use num_cpus; +use std::fs; +use std::io; +use tracing::debug; + +const CGROUP_CPU_USAGE_PATH: &str = "/sys/fs/cgroup/cpu/cpuacct.usage"; // Reports the total CPU time, in nanoseconds, consumed by all tasks in this cgroup +const CGROUP_CPUSET_CPUS_PATH: &str = "/sys/fs/cgroup/cpuset/cpuset.cpus"; // Specifies the CPUs that tasks in this cgroup are permitted to access +const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // Specifies a period of time, in microseconds, for how regularly a cgroup's access to CPU resources should be reallocated +const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period + +const CPU_USAGE_METRIC: &str = "azure.functions.cpu.usage"; +const CPU_LIMIT_METRIC: &str = "azure.functions.cpu.limit"; + +/// Statistics from cgroup v1 files, normalized to nanoseconds +struct CgroupStats { + total: Option, // Total CPU usage (from cpuacct.usage) in nanoseconds + cpu_count: Option, // Number of accessible logical CPUs (from cpuset.cpus) + scheduler_period: Option, // CFS scheduler period (from cpu.cfs_period_us) in nanoseconds + scheduler_quota: Option, // CFS scheduler quota (from cpu.cfs_quota_us) in nanoseconds +} + +/// Computed CPU total and limit metrics +struct CpuStats { + pub total: f64, // Total CPU usage in nanoseconds + pub limit: Option, // CPU limit in nanoseconds + pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count +} + +fn read_cpu_stats() -> Option { + let cgroup_stats = read_cgroup_stats(); + build_cpu_stats(&cgroup_stats) +} + +/// Builds CPU stats - rate and limit +fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { + let total = cgroup_stats.total?; + + let (limit_pct, defaulted) = compute_cpu_limit_pct(cgroup_stats); + + Some(CpuStats { + total: total as f64, + limit: Some(limit_pct), + defaulted_limit: defaulted, + }) +} + +/// Reads raw CPU statistics from cgroup v1 files and converts to nanoseconds +fn read_cgroup_stats() -> CgroupStats { + let total = fs::read_to_string(CGROUP_CPU_USAGE_PATH) + .ok() + .and_then(|contents| contents.trim().parse::().ok()); + if total.is_none() { + debug!("Could not read CPU usage from {CGROUP_CPU_USAGE_PATH}"); + } + + let cpu_count = read_cpu_count_from_file(CGROUP_CPUSET_CPUS_PATH).ok(); + if cpu_count.is_none() { + debug!("Could not read CPU count from {CGROUP_CPUSET_CPUS_PATH}"); + } + + let scheduler_period = fs::read_to_string(CGROUP_CPU_PERIOD_PATH) + .ok() + .and_then(|contents| contents.trim().parse::().map(|v| v * 1000).ok()); // Convert from microseconds to nanoseconds + if scheduler_period.is_none() { + debug!("Could not read scheduler period from {CGROUP_CPU_PERIOD_PATH}"); + } + + let scheduler_quota = fs::read_to_string(CGROUP_CPU_QUOTA_PATH) + .ok() + .and_then(|contents| { + contents.trim().parse::().ok().and_then(|quota| { + // Convert from microseconds to nanoseconds + if quota == -1 { + debug!("CFS scheduler quota is -1, setting to None"); + None + } else { + Some((quota * 1000) as u64) + } + }) + }); + if scheduler_quota.is_none() { + debug!("Could not read scheduler quota from {CGROUP_CPU_QUOTA_PATH}"); + } + + CgroupStats { + total, + cpu_count, + scheduler_period, + scheduler_quota, + } +} + +/// Reads CPU count from cpuset.cpus +/// +/// The cpuset.cpus file contains a comma-separated list, with dashes to represent ranges of CPUs, +/// e.g., "0-2,16" represents CPUs 0, 1, 2, and 16 +/// This function returns the count of CPUs, in this case 4. +fn read_cpu_count_from_file(path: &str) -> Result { + let contents = fs::read_to_string(path)?; + let cpuset_str = contents.trim(); + if cpuset_str.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("File {path} is empty"), + )); + } + debug!("Contents of {path}: {cpuset_str}"); + + let mut cpu_count: u64 = 0; + + for part in cpuset_str.split(',') { + let range: Vec<&str> = part.split('-').collect(); + if range.len() == 2 { + // Range like "0-3" + debug!("Range: {range:?}"); + let start: u64 = range[0].parse().map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse u64 from range {range:?}: {e}"), + ) + })?; + let end: u64 = range[1].parse().map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse u64 from range {range:?}: {e}"), + ) + })?; + cpu_count += end - start + 1; + } else { + // Single CPU like "2" + debug!("Single CPU: {part}"); + cpu_count += 1; + } + } + + debug!("Total CPU count: {cpu_count}"); + Ok(cpu_count) +} + +/// Computes the CPU limit percentage +fn compute_cpu_limit_pct(cgroup_stats: &CgroupStats) -> (f64, bool) { + match compute_cgroup_cpu_limit_pct(cgroup_stats) { + Some(limit) => (limit, false), + None => { + let host_cpu_count = num_cpus::get() as f64; + debug!( + "No CPU limit found, defaulting to host CPU count: {} CPUs", + host_cpu_count + ); + (host_cpu_count * 100.0, true) + } + } +} + +/// Computes the CPU limit percentage from cgroup statistics +/// Limit is computed using min(CPUSet, CFS CPU Quota) +fn compute_cgroup_cpu_limit_pct(cgroup_stats: &CgroupStats) -> Option { + let mut limit_pct = None; + + if let Some(cpu_count) = cgroup_stats.cpu_count { + let host_cpu_count = num_cpus::get() as u64; + if cpu_count != host_cpu_count { + let cpuset_limit_pct = cpu_count as f64 * 100.0; + limit_pct = Some(cpuset_limit_pct); + debug!( + "CPU limit from cpuset: {} CPUs ({}%)", + cpu_count, cpuset_limit_pct + ); + } + } + + if let (Some(scheduler_quota), Some(scheduler_period)) = + (cgroup_stats.scheduler_quota, cgroup_stats.scheduler_period) + { + let quota_limit_pct = 100.0 * (scheduler_quota as f64 / scheduler_period as f64); + match limit_pct { + None => { + limit_pct = Some(quota_limit_pct); + debug!( + "limit_pct is None, setting CPU limit from cfs quota: {}%", + quota_limit_pct + ); + } + Some(current_limit_pct) if quota_limit_pct < current_limit_pct => { + limit_pct = Some(quota_limit_pct); + debug!("CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {}%", quota_limit_pct); + } + _ => { + debug!("Keeping cpuset limit: {:?}%", limit_pct); + } + } + } + limit_pct +} + +pub struct CpuMetricsCollector { + aggregator: AggregatorHandle, + tags: Option, + last_usage_ns: i64, + collection_interval_secs: u64, +} + +impl CpuMetricsCollector { + /// Creates a new CpuMetricsCollector + /// + /// # Arguments + /// + /// * `aggregator` - The aggregator handle to submit metrics to + /// * `tags` - Optional tags to attach to all metrics + /// * `last_usage_ns` - The last usage time in nanoseconds + /// * `collection_interval_secs` - The interval in seconds to collect the metrics + pub fn new( + aggregator: AggregatorHandle, + tags: Option, + last_usage_ns: i64, + collection_interval_secs: u64, + ) -> Self { + Self { + aggregator, + tags, + last_usage_ns, + collection_interval_secs, + } + } + + pub fn collect_and_submit(&self) { + if let Some(cpu_stats) = read_cpu_stats() { + // Submit metrics + } else { + debug!("Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics"); + } + } +} From af53bf0e863735c45e988d416a6090156745a23f Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 13 Feb 2026 15:36:29 -0500 Subject: [PATCH 02/35] Add cpu collector into loop with dogstatsd --- crates/datadog-serverless-compat/src/main.rs | 46 ++++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 32f573cc..604a6a42 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -203,29 +203,39 @@ pub async fn main() { // If DD_ENHANCED_METRICS is true, start the CPU metrics collector // Use the existing aggregator handle // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. - if dd_enhanced_metrics { - if let Some(ref handle) = aggregator_handle { - let cpu_collector_handle = handle.clone(); - tokio::spawn(async move { - let mut cpu_collector = metrics_collector::CpuMetricsCollector::new( - cpu_collector_handle, - None, - -1, - CPU_METRICS_COLLECTION_INTERVAL, - ); - }); - } - } + let cpu_collector = if dd_enhanced_metrics { + aggregator_handle.as_ref().map(|handle| { + metrics_collector::CpuMetricsCollector::new( + handle.clone(), + None, + -1, + CPU_METRICS_COLLECTION_INTERVAL, + ) + }) + } else { + info!("Enhanced metrics disabled"); + None + }; let mut flush_interval = interval(Duration::from_secs(DOGSTATSD_FLUSH_INTERVAL)); + let mut cpu_collection_interval = + interval(Duration::from_secs(CPU_METRICS_COLLECTION_INTERVAL)); flush_interval.tick().await; // discard first tick, which is instantaneous + cpu_collection_interval.tick().await; loop { - flush_interval.tick().await; - - if let Some(metrics_flusher) = metrics_flusher.as_ref() { - debug!("Flushing dogstatsd metrics"); - metrics_flusher.flush().await; + tokio::select! { + _ = flush_interval.tick() => { + if let Some(metrics_flusher) = metrics_flusher.as_ref() { + debug!("Flushing dogstatsd metrics"); + metrics_flusher.flush().await; + } + } + _ = cpu_collection_interval.tick() => { + if let Some(cpu_collector) = cpu_collector.as_ref() { + cpu_collector.collect_and_submit(); + } + } } } } From 9d6916204d7555ac4c9ebaa58bdc2f5df1965e47 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 13 Feb 2026 15:40:22 -0500 Subject: [PATCH 03/35] Fix license --- LICENSE-3rdparty.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 26f606f9..ce85550b 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -69,6 +69,7 @@ headers,https://github.com/hyperium/headers,MIT,Sean McArthur heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,The heck Authors heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,Without Boats +hermit-abi,https://github.com/hermit-os/hermit-rs,MIT OR Apache-2.0,Stefan Lankes hex,https://github.com/KokaKiwi/rust-hex,MIT OR Apache-2.0,KokaKiwi home,https://github.com/rust-lang/cargo,MIT OR Apache-2.0,Brian Anderson http,https://github.com/hyperium/http,MIT OR Apache-2.0,"Alex Crichton , Carl Lerche , Sean McArthur " @@ -119,6 +120,7 @@ multimap,https://github.com/havarnov/multimap,MIT OR Apache-2.0,Håvar Nøvik , Josh Triplett , The Nushell Project Developers" num-traits,https://github.com/rust-num/num-traits,MIT OR Apache-2.0,The Rust Project Developers +num_cpus,https://github.com/seanmonstar/num_cpus,MIT OR Apache-2.0,Sean McArthur once_cell,https://github.com/matklad/once_cell,MIT OR Apache-2.0,Aleksey Kladov openssl-probe,https://github.com/rustls/openssl-probe,MIT OR Apache-2.0,Alex Crichton ordered-float,https://github.com/reem/rust-ordered-float,MIT,"Jonathan Reem , Matt Brubeck " From 36171b8fda968061968a4bf06e1266f057cfe8d8 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Mon, 23 Feb 2026 11:52:00 -0500 Subject: [PATCH 04/35] Move metrics_collector into its own crate --- Cargo.lock | 10 ++++++++++ crates/datadog-metrics-collector/Cargo.toml | 11 +++++++++++ .../src/cpu.rs} | 11 +++++++++++ crates/datadog-metrics-collector/src/lib.rs | 10 ++++++++++ crates/datadog-serverless-compat/Cargo.toml | 1 + crates/datadog-serverless-compat/src/main.rs | 12 ++++-------- crates/datadog-trace-agent/src/lib.rs | 1 - 7 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 crates/datadog-metrics-collector/Cargo.toml rename crates/{datadog-trace-agent/src/metrics_collector.rs => datadog-metrics-collector/src/cpu.rs} (95%) create mode 100644 crates/datadog-metrics-collector/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 9c6478f3..d74ba9dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -390,6 +390,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "datadog-metrics-collector" +version = "0.1.0" +dependencies = [ + "dogstatsd", + "num_cpus", + "tracing", +] + [[package]] name = "datadog-protos" version = "0.1.0" @@ -409,6 +418,7 @@ name = "datadog-serverless-compat" version = "0.1.0" dependencies = [ "datadog-fips", + "datadog-metrics-collector", "datadog-trace-agent", "dogstatsd", "libdd-trace-utils", diff --git a/crates/datadog-metrics-collector/Cargo.toml b/crates/datadog-metrics-collector/Cargo.toml new file mode 100644 index 00000000..4061bd13 --- /dev/null +++ b/crates/datadog-metrics-collector/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "datadog-metrics-collector" +version = "0.1.0" +edition.workspace = true +license.workspace = true +description = "Collector to read, compute, and submit enhanced metrics in Serverless environments" + +[dependencies] +dogstatsd = { path = "../dogstatsd", default-features = true } +num_cpus = "1.16" +tracing = { version = "0.1", default-features = false } diff --git a/crates/datadog-trace-agent/src/metrics_collector.rs b/crates/datadog-metrics-collector/src/cpu.rs similarity index 95% rename from crates/datadog-trace-agent/src/metrics_collector.rs rename to crates/datadog-metrics-collector/src/cpu.rs index 9474da86..1450e8d9 100644 --- a/crates/datadog-trace-agent/src/metrics_collector.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -238,6 +238,17 @@ impl CpuMetricsCollector { pub fn collect_and_submit(&self) { if let Some(cpu_stats) = read_cpu_stats() { // Submit metrics + debug!("Collected cpu stats!"); + debug!("CPU usage: {}", cpu_stats.total); + if let Some(limit) = cpu_stats.limit { + debug!( + "CPU limit: {}%, defaulted: {}", + limit, cpu_stats.defaulted_limit + ); + } else { + debug!("CPU limit: None, defaulted: {}", cpu_stats.defaulted_limit); + } + debug!("Submitting CPU metrics!"); } else { debug!("Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics"); } diff --git a/crates/datadog-metrics-collector/src/lib.rs b/crates/datadog-metrics-collector/src/lib.rs new file mode 100644 index 00000000..dc6c5f0f --- /dev/null +++ b/crates/datadog-metrics-collector/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +#![cfg_attr(not(test), deny(clippy::panic))] +#![cfg_attr(not(test), deny(clippy::unwrap_used))] +#![cfg_attr(not(test), deny(clippy::expect_used))] +#![cfg_attr(not(test), deny(clippy::todo))] +#![cfg_attr(not(test), deny(clippy::unimplemented))] + +pub mod cpu; diff --git a/crates/datadog-serverless-compat/Cargo.toml b/crates/datadog-serverless-compat/Cargo.toml index ed573669..fcbd3da0 100644 --- a/crates/datadog-serverless-compat/Cargo.toml +++ b/crates/datadog-serverless-compat/Cargo.toml @@ -11,6 +11,7 @@ windows-pipes = ["datadog-trace-agent/windows-pipes", "dogstatsd/windows-pipes"] [dependencies] datadog-trace-agent = { path = "../datadog-trace-agent" } +datadog-metrics-collector = { path = "../datadog-metrics-collector" } libdd-trace-utils = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95" } datadog-fips = { path = "../datadog-fips", default-features = false } dogstatsd = { path = "../dogstatsd", default-features = true } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 604a6a42..8a388224 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -18,12 +18,13 @@ use zstd::zstd_safe::CompressionLevel; use datadog_trace_agent::{ aggregator::TraceAggregator, - config, env_verifier, metrics_collector, mini_agent, proxy_flusher, stats_flusher, - stats_processor, + config, env_verifier, mini_agent, proxy_flusher, stats_flusher, stats_processor, trace_flusher::{self, TraceFlusher}, trace_processor, }; +use datadog_metrics_collector::cpu::CpuMetricsCollector; + use libdd_trace_utils::{config_utils::read_cloud_env, trace_utils::EnvironmentType}; use datadog_fips::reqwest_adapter::create_reqwest_client_builder; @@ -205,12 +206,7 @@ pub async fn main() { // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. let cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { - metrics_collector::CpuMetricsCollector::new( - handle.clone(), - None, - -1, - CPU_METRICS_COLLECTION_INTERVAL, - ) + CpuMetricsCollector::new(handle.clone(), None, -1, CPU_METRICS_COLLECTION_INTERVAL) }) } else { info!("Enhanced metrics disabled"); diff --git a/crates/datadog-trace-agent/src/lib.rs b/crates/datadog-trace-agent/src/lib.rs index 5ff530ae..a87bf56b 100644 --- a/crates/datadog-trace-agent/src/lib.rs +++ b/crates/datadog-trace-agent/src/lib.rs @@ -11,7 +11,6 @@ pub mod aggregator; pub mod config; pub mod env_verifier; pub mod http_utils; -pub mod metrics_collector; pub mod mini_agent; pub mod proxy_flusher; pub mod stats_flusher; From 92526096d0fe9691c180f635f9b46d33b5dca467 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Mon, 23 Feb 2026 17:34:02 -0500 Subject: [PATCH 05/35] Submit cpu usage and limit metrics and fix units --- crates/datadog-metrics-collector/src/cpu.rs | 106 ++++++++++++------- crates/datadog-serverless-compat/src/main.rs | 8 +- 2 files changed, 72 insertions(+), 42 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 1450e8d9..ed541f11 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -9,23 +9,23 @@ //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). use dogstatsd::aggregator_service::AggregatorHandle; -use dogstatsd::metric::SortedTags; +use dogstatsd::metric::{Metric, MetricValue, SortedTags}; use num_cpus; use std::fs; use std::io; -use tracing::debug; +use tracing::{debug, error}; const CGROUP_CPU_USAGE_PATH: &str = "/sys/fs/cgroup/cpu/cpuacct.usage"; // Reports the total CPU time, in nanoseconds, consumed by all tasks in this cgroup const CGROUP_CPUSET_CPUS_PATH: &str = "/sys/fs/cgroup/cpuset/cpuset.cpus"; // Specifies the CPUs that tasks in this cgroup are permitted to access const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // Specifies a period of time, in microseconds, for how regularly a cgroup's access to CPU resources should be reallocated const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period -const CPU_USAGE_METRIC: &str = "azure.functions.cpu.usage"; -const CPU_LIMIT_METRIC: &str = "azure.functions.cpu.limit"; +const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage"; +const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.test.cpu.limit"; /// Statistics from cgroup v1 files, normalized to nanoseconds struct CgroupStats { - total: Option, // Total CPU usage (from cpuacct.usage) in nanoseconds + total: Option, // Cumulative CPU usage (from cpuacct.usage) in nanoseconds cpu_count: Option, // Number of accessible logical CPUs (from cpuset.cpus) scheduler_period: Option, // CFS scheduler period (from cpu.cfs_period_us) in nanoseconds scheduler_quota: Option, // CFS scheduler quota (from cpu.cfs_quota_us) in nanoseconds @@ -33,8 +33,8 @@ struct CgroupStats { /// Computed CPU total and limit metrics struct CpuStats { - pub total: f64, // Total CPU usage in nanoseconds - pub limit: Option, // CPU limit in nanoseconds + pub total: f64, // Cumulative CPU usage in nanoseconds + pub limit: Option, // CPU limit in nanocores pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count } @@ -47,7 +47,7 @@ fn read_cpu_stats() -> Option { fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { let total = cgroup_stats.total?; - let (limit_pct, defaulted) = compute_cpu_limit_pct(cgroup_stats); + let (limit_pct, defaulted) = compute_cpu_limit_nc(cgroup_stats); Some(CpuStats { total: total as f64, @@ -149,9 +149,9 @@ fn read_cpu_count_from_file(path: &str) -> Result { Ok(cpu_count) } -/// Computes the CPU limit percentage -fn compute_cpu_limit_pct(cgroup_stats: &CgroupStats) -> (f64, bool) { - match compute_cgroup_cpu_limit_pct(cgroup_stats) { +/// Computes the CPU limit in nanocores, with fallback to host CPU count +fn compute_cpu_limit_nc(cgroup_stats: &CgroupStats) -> (f64, bool) { + match compute_cgroup_cpu_limit_nc(cgroup_stats) { Some(limit) => (limit, false), None => { let host_cpu_count = num_cpus::get() as f64; @@ -159,24 +159,24 @@ fn compute_cpu_limit_pct(cgroup_stats: &CgroupStats) -> (f64, bool) { "No CPU limit found, defaulting to host CPU count: {} CPUs", host_cpu_count ); - (host_cpu_count * 100.0, true) + (host_cpu_count * 1000000000.0, true) // Convert to nanocores } } } -/// Computes the CPU limit percentage from cgroup statistics +/// Computes the CPU limit in nanocores from cgroup statistics /// Limit is computed using min(CPUSet, CFS CPU Quota) -fn compute_cgroup_cpu_limit_pct(cgroup_stats: &CgroupStats) -> Option { - let mut limit_pct = None; +fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { + let mut limit_nc = None; if let Some(cpu_count) = cgroup_stats.cpu_count { let host_cpu_count = num_cpus::get() as u64; if cpu_count != host_cpu_count { - let cpuset_limit_pct = cpu_count as f64 * 100.0; - limit_pct = Some(cpuset_limit_pct); + let cpuset_limit_nc = cpu_count as f64 * 1000000000.0; // Convert to nanocores + limit_nc = Some(cpuset_limit_nc); debug!( - "CPU limit from cpuset: {} CPUs ({}%)", - cpu_count, cpuset_limit_pct + "CPU limit from cpuset: {} CPUs ({} nanocores)", + cpu_count, cpuset_limit_nc ); } } @@ -184,31 +184,31 @@ fn compute_cgroup_cpu_limit_pct(cgroup_stats: &CgroupStats) -> Option { if let (Some(scheduler_quota), Some(scheduler_period)) = (cgroup_stats.scheduler_quota, cgroup_stats.scheduler_period) { - let quota_limit_pct = 100.0 * (scheduler_quota as f64 / scheduler_period as f64); - match limit_pct { + let quota_limit_nc = 1000000000.0 * (scheduler_quota as f64 / scheduler_period as f64); + match limit_nc { None => { - limit_pct = Some(quota_limit_pct); + limit_nc = Some(quota_limit_nc); debug!( - "limit_pct is None, setting CPU limit from cfs quota: {}%", - quota_limit_pct + "limit_pct is None, setting CPU limit from cfs quota: {} nanocores", + quota_limit_nc ); } - Some(current_limit_pct) if quota_limit_pct < current_limit_pct => { - limit_pct = Some(quota_limit_pct); - debug!("CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {}%", quota_limit_pct); + Some(current_limit_nc) if quota_limit_nc < current_limit_nc => { + limit_nc = Some(quota_limit_nc); + debug!("CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {} nanocores", quota_limit_nc); } _ => { - debug!("Keeping cpuset limit: {:?}%", limit_pct); + debug!("Keeping cpuset limit: {:?} nanocores", limit_nc); } } } - limit_pct + limit_nc } pub struct CpuMetricsCollector { aggregator: AggregatorHandle, tags: Option, - last_usage_ns: i64, + last_usage_ns: f64, collection_interval_secs: u64, } @@ -224,7 +224,7 @@ impl CpuMetricsCollector { pub fn new( aggregator: AggregatorHandle, tags: Option, - last_usage_ns: i64, + last_usage_ns: f64, collection_interval_secs: u64, ) -> Self { Self { @@ -235,18 +235,48 @@ impl CpuMetricsCollector { } } - pub fn collect_and_submit(&self) { + pub fn collect_and_submit(&mut self) { if let Some(cpu_stats) = read_cpu_stats() { // Submit metrics debug!("Collected cpu stats!"); + let current_usage_ns = cpu_stats.total; debug!("CPU usage: {}", cpu_stats.total); + + // Skip first collection + if self.last_usage_ns == -1.0 { + debug!("First CPU collection, skipping rate computation"); + self.last_usage_ns = current_usage_ns; + return; + } + + let delta_ns = current_usage_ns - self.last_usage_ns as f64; + self.last_usage_ns = current_usage_ns; + + // Divide nanoseconds delta by collection interval to get usage rate in nanocores + let usage_rate_nc = delta_ns / self.collection_interval_secs as f64; + debug!("Usage rate: {} nanocores/s", usage_rate_nc); + + let now = std::time::UNIX_EPOCH.elapsed() + .map(|d| d.as_secs()) + .unwrap_or(0) + .try_into() + .unwrap_or(0); + + let usage_metric = Metric::new(CPU_USAGE_METRIC.into(), MetricValue::distribution(usage_rate_nc), self.tags.clone(), Some(now)); + + if let Err(e) = self.aggregator.insert_batch(vec![usage_metric]) { + error!("Failed to insert CPU usage metric: {}", e); + } + if let Some(limit) = cpu_stats.limit { - debug!( - "CPU limit: {}%, defaulted: {}", - limit, cpu_stats.defaulted_limit - ); - } else { - debug!("CPU limit: None, defaulted: {}", cpu_stats.defaulted_limit); + debug!("CPU limit: {}", limit); + if cpu_stats.defaulted_limit { + debug!("CPU limit defaulted to host CPU count"); + } + let limit_metric = Metric::new(CPU_LIMIT_METRIC.into(), MetricValue::distribution(limit), self.tags.clone(), Some(now)); + if let Err(e) = self.aggregator.insert_batch(vec![limit_metric]) { + error!("Failed to insert CPU limit metric: {}", e); + } } debug!("Submitting CPU metrics!"); } else { diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 8a388224..1bece02e 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -204,9 +204,9 @@ pub async fn main() { // If DD_ENHANCED_METRICS is true, start the CPU metrics collector // Use the existing aggregator handle // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. - let cpu_collector = if dd_enhanced_metrics { + let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { - CpuMetricsCollector::new(handle.clone(), None, -1, CPU_METRICS_COLLECTION_INTERVAL) + CpuMetricsCollector::new(handle.clone(), None, -1.0, CPU_METRICS_COLLECTION_INTERVAL) }) } else { info!("Enhanced metrics disabled"); @@ -228,8 +228,8 @@ pub async fn main() { } } _ = cpu_collection_interval.tick() => { - if let Some(cpu_collector) = cpu_collector.as_ref() { - cpu_collector.collect_and_submit(); + if let Some(ref mut collector) = cpu_collector { + collector.collect_and_submit(); } } } From f47c4ff8dd5b15f0ec8c07cf07653e98791ee2a6 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 25 Feb 2026 13:08:20 -0500 Subject: [PATCH 06/35] Test more precise time interval, add instance ID as a tag --- crates/datadog-metrics-collector/src/cpu.rs | 56 +++++++++++++++----- crates/datadog-serverless-compat/src/main.rs | 16 +++++- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index ed541f11..2d23c891 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -21,6 +21,7 @@ const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage"; +const CPU_USAGE_PRECISE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage.precise"; const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.test.cpu.limit"; /// Statistics from cgroup v1 files, normalized to nanoseconds @@ -47,11 +48,11 @@ fn read_cpu_stats() -> Option { fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { let total = cgroup_stats.total?; - let (limit_pct, defaulted) = compute_cpu_limit_nc(cgroup_stats); + let (limit_nc, defaulted) = compute_cpu_limit_nc(cgroup_stats); Some(CpuStats { total: total as f64, - limit: Some(limit_pct), + limit: Some(limit_nc), defaulted_limit: defaulted, }) } @@ -170,8 +171,10 @@ fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { let mut limit_nc = None; if let Some(cpu_count) = cgroup_stats.cpu_count { + debug!("CPU count from cpuset: {cpu_count}"); let host_cpu_count = num_cpus::get() as u64; if cpu_count != host_cpu_count { + debug!("CPU count from cpuset is not equal to host CPU count"); let cpuset_limit_nc = cpu_count as f64 * 1000000000.0; // Convert to nanocores limit_nc = Some(cpuset_limit_nc); debug!( @@ -189,7 +192,7 @@ fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { None => { limit_nc = Some(quota_limit_nc); debug!( - "limit_pct is None, setting CPU limit from cfs quota: {} nanocores", + "limit_nc is None, setting CPU limit from cfs quota: {} nanocores", quota_limit_nc ); } @@ -208,8 +211,9 @@ fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { pub struct CpuMetricsCollector { aggregator: AggregatorHandle, tags: Option, - last_usage_ns: f64, collection_interval_secs: u64, + last_usage_ns: f64, + last_collection_time: std::time::Instant, } impl CpuMetricsCollector { @@ -219,19 +223,18 @@ impl CpuMetricsCollector { /// /// * `aggregator` - The aggregator handle to submit metrics to /// * `tags` - Optional tags to attach to all metrics - /// * `last_usage_ns` - The last usage time in nanoseconds /// * `collection_interval_secs` - The interval in seconds to collect the metrics pub fn new( aggregator: AggregatorHandle, tags: Option, - last_usage_ns: f64, collection_interval_secs: u64, ) -> Self { Self { aggregator, tags, - last_usage_ns, collection_interval_secs, + last_usage_ns: -1.0, + last_collection_time: std::time::Instant::now(), } } @@ -241,28 +244,52 @@ impl CpuMetricsCollector { debug!("Collected cpu stats!"); let current_usage_ns = cpu_stats.total; debug!("CPU usage: {}", cpu_stats.total); - + let now_instant = std::time::Instant::now(); + // Skip first collection if self.last_usage_ns == -1.0 { debug!("First CPU collection, skipping rate computation"); self.last_usage_ns = current_usage_ns; + self.last_collection_time = now_instant; return; } - let delta_ns = current_usage_ns - self.last_usage_ns as f64; + let delta_ns = current_usage_ns - self.last_usage_ns; self.last_usage_ns = current_usage_ns; + let elapsed_secs = self.last_collection_time.elapsed().as_secs_f64(); + debug!("Elapsed time: {} seconds", elapsed_secs); + self.last_collection_time = now_instant; // Divide nanoseconds delta by collection interval to get usage rate in nanocores let usage_rate_nc = delta_ns / self.collection_interval_secs as f64; debug!("Usage rate: {} nanocores/s", usage_rate_nc); + let precise_usage_rate_nc = delta_ns / elapsed_secs; + debug!("Precise usage rate: {} nanocores/s", precise_usage_rate_nc); - let now = std::time::UNIX_EPOCH.elapsed() + let now = std::time::UNIX_EPOCH + .elapsed() .map(|d| d.as_secs()) .unwrap_or(0) .try_into() .unwrap_or(0); - let usage_metric = Metric::new(CPU_USAGE_METRIC.into(), MetricValue::distribution(usage_rate_nc), self.tags.clone(), Some(now)); + let precise_metric = Metric::new( + CPU_USAGE_PRECISE_METRIC.into(), + MetricValue::distribution(precise_usage_rate_nc), + self.tags.clone(), + Some(now), + ); + + if let Err(e) = self.aggregator.insert_batch(vec![precise_metric]) { + error!("Failed to insert CPU usage precise metric: {}", e); + } + + let usage_metric = Metric::new( + CPU_USAGE_METRIC.into(), + MetricValue::distribution(usage_rate_nc), + self.tags.clone(), + Some(now), + ); if let Err(e) = self.aggregator.insert_batch(vec![usage_metric]) { error!("Failed to insert CPU usage metric: {}", e); @@ -273,7 +300,12 @@ impl CpuMetricsCollector { if cpu_stats.defaulted_limit { debug!("CPU limit defaulted to host CPU count"); } - let limit_metric = Metric::new(CPU_LIMIT_METRIC.into(), MetricValue::distribution(limit), self.tags.clone(), Some(now)); + let limit_metric = Metric::new( + CPU_LIMIT_METRIC.into(), + MetricValue::distribution(limit), + self.tags.clone(), + Some(now), + ); if let Err(e) = self.aggregator.insert_batch(vec![limit_metric]) { error!("Failed to insert CPU limit metric: {}", e); } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 1bece02e..eca409e1 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -53,7 +53,7 @@ pub async fn main() { .map(|val| val.to_lowercase()) .unwrap_or("info".to_string()); - let (_, env_type) = match read_cloud_env() { + let (app_name, env_type) = match read_cloud_env() { Some(value) => value, None => { error!("Unable to identify environment. Shutting down Mini Agent."); @@ -206,7 +206,19 @@ pub async fn main() { // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { - CpuMetricsCollector::new(handle.clone(), None, -1.0, CPU_METRICS_COLLECTION_INTERVAL) + // Elastic Premium and Premium plans use WEBSITE_INSTANCE_ID to identify the instance + // Flex Consumption and Consumption plans use WEBSITE_POD_NAME or CONTAINER_NAME + let instance_id = env::var("WEBSITE_INSTANCE_ID") + .or_else(|_| env::var("WEBSITE_POD_NAME")) + .or_else(|_| env::var("CONTAINER_NAME")) + .ok(); + debug!("Instance ID: {:?}", instance_id); + let mut tag_str = format!("functionname:{}", app_name); + if let Some(id) = instance_id { + tag_str.push_str(&format!(",instance_id:{}", id)); + } + let tags = SortedTags::parse(&tag_str).ok(); + CpuMetricsCollector::new(handle.clone(), tags, CPU_METRICS_COLLECTION_INTERVAL) }) } else { info!("Enhanced metrics disabled"); From b632c171b76231b65be221e795330632bdbe6741 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 25 Feb 2026 19:35:58 -0500 Subject: [PATCH 07/35] Categorize metrics with azure.functions prefix as enhanced metrics --- crates/dogstatsd/src/origin.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/dogstatsd/src/origin.rs b/crates/dogstatsd/src/origin.rs index d0c0952d..ceb5ad61 100644 --- a/crates/dogstatsd/src/origin.rs +++ b/crates/dogstatsd/src/origin.rs @@ -18,6 +18,7 @@ const AZURE_FUNCTIONS_TAG_VALUE: &str = "azurefunction"; const DATADOG_PREFIX: &str = "datadog."; const AWS_LAMBDA_PREFIX: &str = "aws.lambda"; const GOOGLE_CLOUD_RUN_PREFIX: &str = "gcp.run"; +const AZURE_FUNCTIONS_PREFIX: &str = "azure.functions"; const JVM_PREFIX: &str = "jvm."; const RUNTIME_PREFIX: &str = "runtime."; @@ -87,7 +88,7 @@ impl Metric { || metric_name.starts_with(RUNTIME_PREFIX) { OriginService::ServerlessRuntime - } else if metric_prefix == AWS_LAMBDA_PREFIX || metric_prefix == GOOGLE_CLOUD_RUN_PREFIX { + } else if metric_prefix == AWS_LAMBDA_PREFIX || metric_prefix == GOOGLE_CLOUD_RUN_PREFIX || metric_prefix == AZURE_FUNCTIONS_PREFIX { OriginService::ServerlessEnhanced } else { OriginService::ServerlessCustom From 1bff3a82606d94dbc7435bf166d89a6324a977d7 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Thu, 26 Feb 2026 13:59:24 -0500 Subject: [PATCH 08/35] Refactor to make CpuMetricsCollector, CpuStats, and metrics submission OS-agnostic, create separate crates for Windows and Linux for reading raw CPU data --- crates/datadog-metrics-collector/src/cpu.rs | 199 ++--------------- crates/datadog-metrics-collector/src/lib.rs | 4 + crates/datadog-metrics-collector/src/linux.rs | 201 ++++++++++++++++++ .../datadog-metrics-collector/src/windows.rs | 14 ++ crates/dogstatsd/src/origin.rs | 20 +- 5 files changed, 242 insertions(+), 196 deletions(-) create mode 100644 crates/datadog-metrics-collector/src/linux.rs create mode 100644 crates/datadog-metrics-collector/src/windows.rs diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 2d23c891..d31ef1c6 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -3,212 +3,32 @@ //! CPU metrics collector for Azure Functions //! -//! This module provides functionality to read raw CPU statistics from cgroup v1 files, -//! compute the CPU usage and limit, and submit them as distribution metrics to Datadog. +//! This module provides OS-agnostic CPU metrics collection, computing CPU usage +//! adnd limit and submitting them as distribution metrics to Datadog. //! //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). use dogstatsd::aggregator_service::AggregatorHandle; use dogstatsd::metric::{Metric, MetricValue, SortedTags}; -use num_cpus; -use std::fs; -use std::io; use tracing::{debug, error}; -const CGROUP_CPU_USAGE_PATH: &str = "/sys/fs/cgroup/cpu/cpuacct.usage"; // Reports the total CPU time, in nanoseconds, consumed by all tasks in this cgroup -const CGROUP_CPUSET_CPUS_PATH: &str = "/sys/fs/cgroup/cpuset/cpuset.cpus"; // Specifies the CPUs that tasks in this cgroup are permitted to access -const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // Specifies a period of time, in microseconds, for how regularly a cgroup's access to CPU resources should be reallocated -const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period - const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage"; const CPU_USAGE_PRECISE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage.precise"; const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.test.cpu.limit"; -/// Statistics from cgroup v1 files, normalized to nanoseconds -struct CgroupStats { - total: Option, // Cumulative CPU usage (from cpuacct.usage) in nanoseconds - cpu_count: Option, // Number of accessible logical CPUs (from cpuset.cpus) - scheduler_period: Option, // CFS scheduler period (from cpu.cfs_period_us) in nanoseconds - scheduler_quota: Option, // CFS scheduler quota (from cpu.cfs_quota_us) in nanoseconds -} - /// Computed CPU total and limit metrics -struct CpuStats { +pub struct CpuStats { pub total: f64, // Cumulative CPU usage in nanoseconds pub limit: Option, // CPU limit in nanocores pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count } -fn read_cpu_stats() -> Option { - let cgroup_stats = read_cgroup_stats(); - build_cpu_stats(&cgroup_stats) -} - -/// Builds CPU stats - rate and limit -fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { - let total = cgroup_stats.total?; - - let (limit_nc, defaulted) = compute_cpu_limit_nc(cgroup_stats); - - Some(CpuStats { - total: total as f64, - limit: Some(limit_nc), - defaulted_limit: defaulted, - }) -} - -/// Reads raw CPU statistics from cgroup v1 files and converts to nanoseconds -fn read_cgroup_stats() -> CgroupStats { - let total = fs::read_to_string(CGROUP_CPU_USAGE_PATH) - .ok() - .and_then(|contents| contents.trim().parse::().ok()); - if total.is_none() { - debug!("Could not read CPU usage from {CGROUP_CPU_USAGE_PATH}"); - } - - let cpu_count = read_cpu_count_from_file(CGROUP_CPUSET_CPUS_PATH).ok(); - if cpu_count.is_none() { - debug!("Could not read CPU count from {CGROUP_CPUSET_CPUS_PATH}"); - } - - let scheduler_period = fs::read_to_string(CGROUP_CPU_PERIOD_PATH) - .ok() - .and_then(|contents| contents.trim().parse::().map(|v| v * 1000).ok()); // Convert from microseconds to nanoseconds - if scheduler_period.is_none() { - debug!("Could not read scheduler period from {CGROUP_CPU_PERIOD_PATH}"); - } - - let scheduler_quota = fs::read_to_string(CGROUP_CPU_QUOTA_PATH) - .ok() - .and_then(|contents| { - contents.trim().parse::().ok().and_then(|quota| { - // Convert from microseconds to nanoseconds - if quota == -1 { - debug!("CFS scheduler quota is -1, setting to None"); - None - } else { - Some((quota * 1000) as u64) - } - }) - }); - if scheduler_quota.is_none() { - debug!("Could not read scheduler quota from {CGROUP_CPU_QUOTA_PATH}"); - } - - CgroupStats { - total, - cpu_count, - scheduler_period, - scheduler_quota, - } -} - -/// Reads CPU count from cpuset.cpus -/// -/// The cpuset.cpus file contains a comma-separated list, with dashes to represent ranges of CPUs, -/// e.g., "0-2,16" represents CPUs 0, 1, 2, and 16 -/// This function returns the count of CPUs, in this case 4. -fn read_cpu_count_from_file(path: &str) -> Result { - let contents = fs::read_to_string(path)?; - let cpuset_str = contents.trim(); - if cpuset_str.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("File {path} is empty"), - )); - } - debug!("Contents of {path}: {cpuset_str}"); - - let mut cpu_count: u64 = 0; - - for part in cpuset_str.split(',') { - let range: Vec<&str> = part.split('-').collect(); - if range.len() == 2 { - // Range like "0-3" - debug!("Range: {range:?}"); - let start: u64 = range[0].parse().map_err(|e| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Failed to parse u64 from range {range:?}: {e}"), - ) - })?; - let end: u64 = range[1].parse().map_err(|e| { - io::Error::new( - io::ErrorKind::InvalidData, - format!("Failed to parse u64 from range {range:?}: {e}"), - ) - })?; - cpu_count += end - start + 1; - } else { - // Single CPU like "2" - debug!("Single CPU: {part}"); - cpu_count += 1; - } - } - - debug!("Total CPU count: {cpu_count}"); - Ok(cpu_count) -} - -/// Computes the CPU limit in nanocores, with fallback to host CPU count -fn compute_cpu_limit_nc(cgroup_stats: &CgroupStats) -> (f64, bool) { - match compute_cgroup_cpu_limit_nc(cgroup_stats) { - Some(limit) => (limit, false), - None => { - let host_cpu_count = num_cpus::get() as f64; - debug!( - "No CPU limit found, defaulting to host CPU count: {} CPUs", - host_cpu_count - ); - (host_cpu_count * 1000000000.0, true) // Convert to nanocores - } - } -} - -/// Computes the CPU limit in nanocores from cgroup statistics -/// Limit is computed using min(CPUSet, CFS CPU Quota) -fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { - let mut limit_nc = None; - - if let Some(cpu_count) = cgroup_stats.cpu_count { - debug!("CPU count from cpuset: {cpu_count}"); - let host_cpu_count = num_cpus::get() as u64; - if cpu_count != host_cpu_count { - debug!("CPU count from cpuset is not equal to host CPU count"); - let cpuset_limit_nc = cpu_count as f64 * 1000000000.0; // Convert to nanocores - limit_nc = Some(cpuset_limit_nc); - debug!( - "CPU limit from cpuset: {} CPUs ({} nanocores)", - cpu_count, cpuset_limit_nc - ); - } - } - - if let (Some(scheduler_quota), Some(scheduler_period)) = - (cgroup_stats.scheduler_quota, cgroup_stats.scheduler_period) - { - let quota_limit_nc = 1000000000.0 * (scheduler_quota as f64 / scheduler_period as f64); - match limit_nc { - None => { - limit_nc = Some(quota_limit_nc); - debug!( - "limit_nc is None, setting CPU limit from cfs quota: {} nanocores", - quota_limit_nc - ); - } - Some(current_limit_nc) if quota_limit_nc < current_limit_nc => { - limit_nc = Some(quota_limit_nc); - debug!("CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {} nanocores", quota_limit_nc); - } - _ => { - debug!("Keeping cpuset limit: {:?} nanocores", limit_nc); - } - } - } - limit_nc +pub trait CpuStatsReader { + fn read(&self) -> Option; } pub struct CpuMetricsCollector { + reader: Box, aggregator: AggregatorHandle, tags: Option, collection_interval_secs: u64, @@ -229,7 +49,12 @@ impl CpuMetricsCollector { tags: Option, collection_interval_secs: u64, ) -> Self { + #[cfg(target_os = "windows")] + let reader: Box = Box::new(crate::windows::WindowsCpuStatsReader); + #[cfg(not(target_os = "windows"))] + let reader: Box = Box::new(crate::linux::LinuxCpuStatsReader); Self { + reader, aggregator, tags, collection_interval_secs, @@ -239,7 +64,7 @@ impl CpuMetricsCollector { } pub fn collect_and_submit(&mut self) { - if let Some(cpu_stats) = read_cpu_stats() { + if let Some(cpu_stats) = self.reader.read() { // Submit metrics debug!("Collected cpu stats!"); let current_usage_ns = cpu_stats.total; diff --git a/crates/datadog-metrics-collector/src/lib.rs b/crates/datadog-metrics-collector/src/lib.rs index dc6c5f0f..f600d65c 100644 --- a/crates/datadog-metrics-collector/src/lib.rs +++ b/crates/datadog-metrics-collector/src/lib.rs @@ -8,3 +8,7 @@ #![cfg_attr(not(test), deny(clippy::unimplemented))] pub mod cpu; +#[cfg(not(target_os = "windows"))] +pub mod linux; +#[cfg(target_os = "windows")] +pub mod windows; diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs new file mode 100644 index 00000000..87220fc3 --- /dev/null +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -0,0 +1,201 @@ +// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! CPU metrics collector for Azure Functions +//! +//! This module provides functionality to read raw CPU statistics from cgroup v1 files +//! and compute the CPU usage and limit in Linux environments. +//! +//! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). + +use crate::cpu::{CpuStats, CpuStatsReader}; +use num_cpus; +use std::fs; +use std::io; +use tracing::debug; + +const CGROUP_CPU_USAGE_PATH: &str = "/sys/fs/cgroup/cpu/cpuacct.usage"; // Reports the total CPU time, in nanoseconds, consumed by all tasks in this cgroup +const CGROUP_CPUSET_CPUS_PATH: &str = "/sys/fs/cgroup/cpuset/cpuset.cpus"; // Specifies the CPUs that tasks in this cgroup are permitted to access +const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // Specifies a period of time, in microseconds, for how regularly a cgroup's access to CPU resources should be reallocated +const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period + +/// Statistics from cgroup v1 files, normalized to nanoseconds +struct CgroupStats { + total: Option, // Cumulative CPU usage (from cpuacct.usage) in nanoseconds + cpu_count: Option, // Number of accessible logical CPUs (from cpuset.cpus) + scheduler_period: Option, // CFS scheduler period (from cpu.cfs_period_us) in nanoseconds + scheduler_quota: Option, // CFS scheduler quota (from cpu.cfs_quota_us) in nanoseconds +} + +pub struct LinuxCpuStatsReader; + +impl CpuStatsReader for LinuxCpuStatsReader { + fn read(&self) -> Option { + let cgroup_stats = read_cgroup_stats(); + build_cpu_stats(&cgroup_stats) + } +} + +/// Builds CPU stats - rate and limit +fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { + let total = cgroup_stats.total?; + + let (limit_nc, defaulted) = compute_cpu_limit_nc(cgroup_stats); + + Some(CpuStats { + total: total as f64, + limit: Some(limit_nc), + defaulted_limit: defaulted, + }) +} + +/// Reads raw CPU statistics from cgroup v1 files and converts to nanoseconds +fn read_cgroup_stats() -> CgroupStats { + let total = fs::read_to_string(CGROUP_CPU_USAGE_PATH) + .ok() + .and_then(|contents| contents.trim().parse::().ok()); + if total.is_none() { + debug!("Could not read CPU usage from {CGROUP_CPU_USAGE_PATH}"); + } + + let cpu_count = read_cpu_count_from_file(CGROUP_CPUSET_CPUS_PATH).ok(); + if cpu_count.is_none() { + debug!("Could not read CPU count from {CGROUP_CPUSET_CPUS_PATH}"); + } + + let scheduler_period = fs::read_to_string(CGROUP_CPU_PERIOD_PATH) + .ok() + .and_then(|contents| contents.trim().parse::().map(|v| v * 1000).ok()); // Convert from microseconds to nanoseconds + if scheduler_period.is_none() { + debug!("Could not read scheduler period from {CGROUP_CPU_PERIOD_PATH}"); + } + + let scheduler_quota = fs::read_to_string(CGROUP_CPU_QUOTA_PATH) + .ok() + .and_then(|contents| { + contents.trim().parse::().ok().and_then(|quota| { + // Convert from microseconds to nanoseconds + if quota == -1 { + debug!("CFS scheduler quota is -1, setting to None"); + None + } else { + Some((quota * 1000) as u64) + } + }) + }); + if scheduler_quota.is_none() { + debug!("Could not read scheduler quota from {CGROUP_CPU_QUOTA_PATH}"); + } + + CgroupStats { + total, + cpu_count, + scheduler_period, + scheduler_quota, + } +} + +/// Reads CPU count from cpuset.cpus +/// +/// The cpuset.cpus file contains a comma-separated list, with dashes to represent ranges of CPUs, +/// e.g., "0-2,16" represents CPUs 0, 1, 2, and 16 +/// This function returns the count of CPUs, in this case 4. +fn read_cpu_count_from_file(path: &str) -> Result { + let contents = fs::read_to_string(path)?; + let cpuset_str = contents.trim(); + if cpuset_str.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("File {path} is empty"), + )); + } + debug!("Contents of {path}: {cpuset_str}"); + + let mut cpu_count: u64 = 0; + + for part in cpuset_str.split(',') { + let range: Vec<&str> = part.split('-').collect(); + if range.len() == 2 { + // Range like "0-3" + debug!("Range: {range:?}"); + let start: u64 = range[0].parse().map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse u64 from range {range:?}: {e}"), + ) + })?; + let end: u64 = range[1].parse().map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to parse u64 from range {range:?}: {e}"), + ) + })?; + cpu_count += end - start + 1; + } else { + // Single CPU like "2" + debug!("Single CPU: {part}"); + cpu_count += 1; + } + } + + debug!("Total CPU count: {cpu_count}"); + Ok(cpu_count) +} + +/// Computes the CPU limit in nanocores, with fallback to host CPU count +fn compute_cpu_limit_nc(cgroup_stats: &CgroupStats) -> (f64, bool) { + match compute_cgroup_cpu_limit_nc(cgroup_stats) { + Some(limit) => (limit, false), + None => { + let host_cpu_count = num_cpus::get() as f64; + debug!( + "No CPU limit found, defaulting to host CPU count: {} CPUs", + host_cpu_count + ); + (host_cpu_count * 1000000000.0, true) // Convert to nanocores + } + } +} + +/// Computes the CPU limit in nanocores from cgroup statistics +/// Limit is computed using min(CPUSet, CFS CPU Quota) +fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { + let mut limit_nc = None; + + if let Some(cpu_count) = cgroup_stats.cpu_count { + debug!("CPU count from cpuset: {cpu_count}"); + let host_cpu_count = num_cpus::get() as u64; + if cpu_count != host_cpu_count { + debug!("CPU count from cpuset is not equal to host CPU count"); + let cpuset_limit_nc = cpu_count as f64 * 1000000000.0; // Convert to nanocores + limit_nc = Some(cpuset_limit_nc); + debug!( + "CPU limit from cpuset: {} CPUs ({} nanocores)", + cpu_count, cpuset_limit_nc + ); + } + } + + if let (Some(scheduler_quota), Some(scheduler_period)) = + (cgroup_stats.scheduler_quota, cgroup_stats.scheduler_period) + { + let quota_limit_nc = 1000000000.0 * (scheduler_quota as f64 / scheduler_period as f64); + match limit_nc { + None => { + limit_nc = Some(quota_limit_nc); + debug!( + "limit_nc is None, setting CPU limit from cfs quota: {} nanocores", + quota_limit_nc + ); + } + Some(current_limit_nc) if quota_limit_nc < current_limit_nc => { + limit_nc = Some(quota_limit_nc); + debug!("CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {} nanocores", quota_limit_nc); + } + _ => { + debug!("Keeping cpuset limit: {:?} nanocores", limit_nc); + } + } + } + limit_nc +} diff --git a/crates/datadog-metrics-collector/src/windows.rs b/crates/datadog-metrics-collector/src/windows.rs new file mode 100644 index 00000000..1be055ab --- /dev/null +++ b/crates/datadog-metrics-collector/src/windows.rs @@ -0,0 +1,14 @@ +// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +use crate::cpu::{CpuStats, CpuStatsReader}; +use tracing::debug; + +pub struct WindowsCpuStatsReader; + +impl CpuStatsReader for WindowsCpuStatsReader { + fn read(&self) -> Option { + debug!("Reading CPU stats from Windows"); + None + } +} diff --git a/crates/dogstatsd/src/origin.rs b/crates/dogstatsd/src/origin.rs index ceb5ad61..818766d5 100644 --- a/crates/dogstatsd/src/origin.rs +++ b/crates/dogstatsd/src/origin.rs @@ -84,15 +84,17 @@ impl Metric { .join("."); // Determine the service based on metric prefix first - let service = if metric_name.starts_with(JVM_PREFIX) - || metric_name.starts_with(RUNTIME_PREFIX) - { - OriginService::ServerlessRuntime - } else if metric_prefix == AWS_LAMBDA_PREFIX || metric_prefix == GOOGLE_CLOUD_RUN_PREFIX || metric_prefix == AZURE_FUNCTIONS_PREFIX { - OriginService::ServerlessEnhanced - } else { - OriginService::ServerlessCustom - }; + let service = + if metric_name.starts_with(JVM_PREFIX) || metric_name.starts_with(RUNTIME_PREFIX) { + OriginService::ServerlessRuntime + } else if metric_prefix == AWS_LAMBDA_PREFIX + || metric_prefix == GOOGLE_CLOUD_RUN_PREFIX + || metric_prefix == AZURE_FUNCTIONS_PREFIX + { + OriginService::ServerlessEnhanced + } else { + OriginService::ServerlessCustom + }; // Then determine the category based on tags let category = if has_tag_value(&tags, AWS_LAMBDA_TAG_KEY, "") { From fac5fda53da5af76f6e41adc736b8b3e94f16331 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 4 Mar 2026 18:07:49 -0500 Subject: [PATCH 09/35] Testing different cpu collection methods --- crates/datadog-metrics-collector/Cargo.toml | 3 + crates/datadog-metrics-collector/src/cpu.rs | 4 +- crates/datadog-metrics-collector/src/lib.rs | 4 +- crates/datadog-metrics-collector/src/linux.rs | 76 +++++- .../datadog-metrics-collector/src/windows.rs | 225 +++++++++++++++++- 5 files changed, 305 insertions(+), 7 deletions(-) diff --git a/crates/datadog-metrics-collector/Cargo.toml b/crates/datadog-metrics-collector/Cargo.toml index 4061bd13..5531b8e0 100644 --- a/crates/datadog-metrics-collector/Cargo.toml +++ b/crates/datadog-metrics-collector/Cargo.toml @@ -9,3 +9,6 @@ description = "Collector to read, compute, and submit enhanced metrics in Server dogstatsd = { path = "../dogstatsd", default-features = true } num_cpus = "1.16" tracing = { version = "0.1", default-features = false } + +[target.'cfg(target_os = "windows")'.dependencies] +windows = { version = "0.58", features = ["Win32_System_Threading", "Win32_Foundation", "Win32_System_JobObjects", "Win32_System_Diagnostics_ToolHelp"] } \ No newline at end of file diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index d31ef1c6..7204af5d 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -3,8 +3,8 @@ //! CPU metrics collector for Azure Functions //! -//! This module provides OS-agnostic CPU metrics collection, computing CPU usage -//! adnd limit and submitting them as distribution metrics to Datadog. +//! This module provides OS-agnostic CPU stats collection, CPU usage +//! and limit computation, and metrics submission to Datadog. //! //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). diff --git a/crates/datadog-metrics-collector/src/lib.rs b/crates/datadog-metrics-collector/src/lib.rs index f600d65c..abf337d5 100644 --- a/crates/datadog-metrics-collector/src/lib.rs +++ b/crates/datadog-metrics-collector/src/lib.rs @@ -9,6 +9,6 @@ pub mod cpu; #[cfg(not(target_os = "windows"))] -pub mod linux; +pub(crate) mod linux; #[cfg(target_os = "windows")] -pub mod windows; +pub(crate) mod windows; diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index 87220fc3..11e2dedd 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -9,7 +9,6 @@ //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). use crate::cpu::{CpuStats, CpuStatsReader}; -use num_cpus; use std::fs; use std::io; use tracing::debug; @@ -18,6 +17,8 @@ const CGROUP_CPU_USAGE_PATH: &str = "/sys/fs/cgroup/cpu/cpuacct.usage"; // Repor const CGROUP_CPUSET_CPUS_PATH: &str = "/sys/fs/cgroup/cpuset/cpuset.cpus"; // Specifies the CPUs that tasks in this cgroup are permitted to access const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // Specifies a period of time, in microseconds, for how regularly a cgroup's access to CPU resources should be reallocated const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period +const PROC_UPTIME_PATH: &str = "/proc/uptime"; // Reports the total uptime of the system, in seconds +const PROC_STAT_PATH: &str = "/proc/stat"; // Reports the total CPU time, in nanoseconds, consumed by all processes and threads in the system /// Statistics from cgroup v1 files, normalized to nanoseconds struct CgroupStats { @@ -29,8 +30,81 @@ struct CgroupStats { pub struct LinuxCpuStatsReader; +pub fn log_cgroup_processes() { + let tasks = match fs::read_to_string("/sys/fs/cgroup/cpu/cgroup.procs") { + Ok(t) => t, + Err(e) => { debug!("Failed to read cgroup tasks: {}", e); return; } + }; + + debug!("Processes in cgroup:"); + let mut total_ns: u64 = 0; + + for line in tasks.lines() { + let pid: u32 = match line.trim().parse() { + Ok(p) => p, + Err(_) => continue, + }; + + let comm = fs::read_to_string(format!("/proc/{}/comm", pid)) + .unwrap_or_default(); + let comm = comm.trim(); + + let stat = match fs::read_to_string(format!("/proc/{}/stat", pid)) { + Ok(s) => s, + Err(_) => { + debug!("PID={} name={} (exited)", pid, comm); + continue; + } + }; + + let fields: Vec<&str> = stat.split_whitespace().collect(); + if fields.len() < 15 { + continue; + } + let utime: u64 = fields[13].parse().unwrap_or(0); + let stime: u64 = fields[14].parse().unwrap_or(0); + let ns = (utime + stime) * 10_000_000; + total_ns += ns; + + debug!("PID={} name={} CPU: {} ns (user: {} ns, kernel: {} ns)", + pid, comm, ns, utime * 10_000_000, stime * 10_000_000); + } + + debug!("Sum of cgroup PIDs: {} ns", total_ns); + + if let Ok(cgroup) = fs::read_to_string(CGROUP_CPU_USAGE_PATH) { + if let Ok(cgroup_ns) = cgroup.trim().parse::() { + debug!("cpuacct.usage: {} ns", cgroup_ns); + debug!("cgroup PIDs sum is {:.1}% of cpuacct.usage", total_ns as f64 / cgroup_ns as f64 * 100.0); + } + } +} + + +fn read_proc_stat_snapshot() -> Option { + let contents = fs::read_to_string(PROC_STAT_PATH).ok()?; + let cpu_line = contents.lines().find(|l| l.starts_with("cpu "))?; + let mut values = cpu_line.split_whitespace(); + values.next(); // skip "cpu" label + let user: u64 = values.next()?.parse().ok()?; + let nice: u64 = values.next()?.parse().ok()?; + let system: u64 = values.next()?.parse().ok()?; + // jiffies to nanoseconds (USER_HZ=100, so 1 jiffy = 10_000_000 ns) + let active_ns = (user + nice + system) * 10_000_000; + debug!("proc/stat active: {} ns", active_ns); + Some(active_ns) +} + impl CpuStatsReader for LinuxCpuStatsReader { fn read(&self) -> Option { + debug!("Reading CPU stats from Linux - using procstat"); + log_cgroup_processes(); + // let total_time_ns = read_proc_stat_snapshot()?; + // Some(CpuStats { + // total: total_time_ns as f64, + // limit: Some(num_cpus::get() as f64 * 1000000000.0), + // defaulted_limit: true, + // }) let cgroup_stats = read_cgroup_stats(); build_cpu_stats(&cgroup_stats) } diff --git a/crates/datadog-metrics-collector/src/windows.rs b/crates/datadog-metrics-collector/src/windows.rs index 1be055ab..03868ad5 100644 --- a/crates/datadog-metrics-collector/src/windows.rs +++ b/crates/datadog-metrics-collector/src/windows.rs @@ -1,14 +1,235 @@ // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +//! CPU metrics collector for Azure Functions +//! +//! This module provides functionality to read raw CPU statistics +//! and compute the CPU usage and limit in Windows environments. +//! +//! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). + use crate::cpu::{CpuStats, CpuStatsReader}; +use std::mem::size_of; use tracing::debug; +use windows::Win32::Foundation::{FILETIME, CloseHandle}; +use windows::Win32::System::JobObjects::{ + JobObjectBasicAccountingInformation, JobObjectCpuRateControlInformation, + QueryInformationJobObject, JOBOBJECT_BASIC_ACCOUNTING_INFORMATION, + JOBOBJECT_CPU_RATE_CONTROL_INFORMATION, JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, +}; +use windows::Win32::System::Threading::{GetCurrentProcess, GetProcessTimes, GetSystemTimes, OpenProcess, PROCESS_QUERY_LIMITED_INFORMATION}; +use windows::Win32::System::Diagnostics::ToolHelp::{ + CreateToolhelp32Snapshot, Process32FirstW, Process32NextW, + PROCESSENTRY32W, TH32CS_SNAPPROCESS, +}; pub struct WindowsCpuStatsReader; +pub fn log_all_processes() { + let snapshot = unsafe { CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0) }; + let Ok(snapshot) = snapshot else { + debug!("Failed to create process snapshot"); + return; + }; + + let mut entry = PROCESSENTRY32W::default(); + entry.dwSize = size_of::() as u32; + + if unsafe { Process32FirstW(snapshot, &mut entry) }.is_err() { + debug!("Failed to get first process"); + return; + } + + loop { + let name = String::from_utf16_lossy( + &entry.szExeFile[..entry.szExeFile.iter().position(|&c| c == 0).unwrap_or(260)] + ); + debug!( + "Process: PID={} PPID={} name={}", + entry.th32ProcessID, + entry.th32ParentProcessID, + name + ); + if let Some(cpu_ns) = read_process_cpu_time_ns(entry.th32ProcessID) { + debug!( + "PID={} CPU: {} ns", + entry.th32ProcessID, + cpu_ns + ); + } else { + debug!("PID={} CPU: unavailable", entry.th32ProcessID); + } + + + if unsafe { Process32NextW(snapshot, &mut entry) }.is_err() { + break; + } + } + let job_total = read_job_cpu_time_ns(); + debug!("Job Object total CPU: {:?} ns", job_total); +} + +fn read_system_cpu_usage_ns() -> Option { + let mut idle = FILETIME::default(); + let mut kernel = FILETIME::default(); + let mut user = FILETIME::default(); + + unsafe { + GetSystemTimes(Some(&mut idle), Some(&mut kernel), Some(&mut user)).ok()?; + } + let idle_ns = filetime_to_ns(&idle); + let kernel_ns = filetime_to_ns(&kernel); + let user_ns = filetime_to_ns(&user); + let active_ns = (kernel_ns - idle_ns) + user_ns; + Some(active_ns) +} + +fn filetime_to_ns(filetime: &FILETIME) -> u64 { + (((filetime.dwHighDateTime as u64) << 32) | filetime.dwLowDateTime as u64) * 100 +} + +fn read_job_cpu_time_ns() -> Option { + let mut info = JOBOBJECT_BASIC_ACCOUNTING_INFORMATION::default(); + unsafe { + QueryInformationJobObject( + None, // If the handle is null, the job associated with the current process is used + JobObjectBasicAccountingInformation, // The type of info to retrieve + &mut info as *mut _ as *mut _, // Pointer to the struct to receive the info + size_of::() as u32, + None, + ) + .ok()?; + }; + // TotalUserTime and TotalKernelTime are in 100-nanosecond units - multiply by 100 to get nanoseconds + let total_ns = (info.TotalUserTime + info.TotalKernelTime) as u64 * 100; + debug!( + "Job CPU time: {} ns (user: {} ns, kernel: {} ns)", + total_ns, + info.TotalUserTime as u64 * 100, + info.TotalKernelTime as u64 * 100 + ); + Some(total_ns) +} + +fn read_process_cpu_time_ns(pid: u32) -> Option { + let mut creation_time = FILETIME::default(); + let mut exit_time = FILETIME::default(); + let mut kernel_time = FILETIME::default(); + let mut user_time = FILETIME::default(); + + unsafe { + let handle = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, false, pid).ok()?; + let result = GetProcessTimes( + handle, + &mut creation_time, + &mut exit_time, + &mut kernel_time, + &mut user_time, + ); + CloseHandle(handle).ok(); + result.ok()?; + } + + let user_ns = filetime_to_ns(&user_time); + let kernel_ns = filetime_to_ns(&kernel_time); + debug!("PID={} CPU: {} ns (user: {} ns, kernel: {} ns)", pid, user_ns + kernel_ns, user_ns, kernel_ns); + Some(user_ns + kernel_ns) +} + +fn read_process_cpu_usage_ns() -> Option { + // Using GetProcessTimes + let mut creation_time = FILETIME::default(); + let mut exit_time = FILETIME::default(); + let mut kernel_time = FILETIME::default(); + let mut user_time = FILETIME::default(); + + // All calls to Win32 APIs require unsafe in Rust + unsafe { + let handle = GetCurrentProcess(); + GetProcessTimes( + handle, + &mut creation_time, + &mut exit_time, + &mut kernel_time, + &mut user_time, + ) + .ok()?; + } + + // The FILETIME struct contains two 32-bit values that combine to form a 64-bit count of 100-nanosecond time units + // Multiply by 100 to get a 64-bit count of nanoseconds + let user_time_ns = + (((user_time.dwHighDateTime as u64) << 32) | user_time.dwLowDateTime as u64) * 100; + let kernel_time_ns = + (((kernel_time.dwHighDateTime as u64) << 32) | kernel_time.dwLowDateTime as u64) * 100; + let total_time_ns = user_time_ns + kernel_time_ns; + debug!( + "Windows CPU usage: {} ns (user: {} ns, kernel: {} ns)", + total_time_ns, user_time_ns, kernel_time_ns + ); + Some(total_time_ns) +} + +/// Reads the CPU rate limit for the job object in nanocores. +/// Falls back to host CPU count if no limit is set. +fn read_job_cpu_limit_nc() -> (f64, bool) { + let mut info = JOBOBJECT_CPU_RATE_CONTROL_INFORMATION::default(); + let result = unsafe { + QueryInformationJobObject( + None, + JobObjectCpuRateControlInformation, + &mut info as *mut _ as *mut _, + size_of::() as u32, + None, + ) + }; + if result.is_ok() + && (info.ControlFlags & JOB_OBJECT_CPU_RATE_CONTROL_ENABLE) + == JOB_OBJECT_CPU_RATE_CONTROL_ENABLE + { + // CpuRate is in units of 1/100th of a percent (10000 = 100% = 1 core) + let cpu_rate = unsafe { info.Anonymous.CpuRate } as f64; + let limit_nc = (cpu_rate / 10000.0) * num_cpus::get() as f64 * 1_000_000_000.0; + debug!( + "Job CPU rate limit: {} nanocores (CpuRate: {})", + limit_nc, cpu_rate + ); + (limit_nc, false) + } else { + let limit_nc = num_cpus::get() as f64 * 1_000_000_000.0; + debug!( + "No job CPU rate limit found, defaulting to host CPU count: {} nanocores", + limit_nc + ); + (limit_nc, true) + } +} + impl CpuStatsReader for WindowsCpuStatsReader { fn read(&self) -> Option { - debug!("Reading CPU stats from Windows"); - None + debug!("Reading CPU stats from Windows - using Job Object and comparing to GetProcessTimes for each process"); + log_all_processes(); + + // let total_time_ns = read_system_cpu_usage_ns()?; + // Using QueryInformationJobObject + let total_time_ns = read_job_cpu_time_ns()?; + // Using GetProcessTimes + // let total_time_ns = read_process_cpu_usage_ns()?; + + let (limit_nc, defaulted_limit) = read_job_cpu_limit_nc(); + Some(CpuStats { + total: total_time_ns as f64, + limit: Some(limit_nc), + defaulted_limit: defaulted_limit, + }) + + // let limit_nc = num_cpus::get() as f64 * 1000000000.0; + // debug!("Windows CPU limit: {} nc", limit_nc); + + // Some(CpuStats { + // total: total_time_ns as f64, + // limit: Some(limit_nc), + // defaulted_limit: true, + // }) } } From bf1e8a7ea7b613fd6e38e5daedbed9488866d3eb Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 4 Mar 2026 18:17:51 -0500 Subject: [PATCH 10/35] Clean up and emit cpu usage and host-level cpu usage metrics --- Cargo.lock | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index d74ba9dd..b98af55d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -397,6 +397,7 @@ dependencies = [ "dogstatsd", "num_cpus", "tracing", + "windows", ] [[package]] @@ -3045,12 +3046,76 @@ dependencies = [ "rustix 0.38.44", ] +[[package]] +name = "windows" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" +dependencies = [ + "windows-core", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-implement" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "windows-interface" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.52.0" From c43bb328ad81323e2c94b2502764536655b60b84 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 4 Mar 2026 18:17:51 -0500 Subject: [PATCH 11/35] Clean up and emit cpu usage and host-level cpu usage metrics --- .vscode/settings.json | 3 + crates/datadog-metrics-collector/src/cpu.rs | 42 +++--- crates/datadog-metrics-collector/src/linux.rs | 61 +-------- .../datadog-metrics-collector/src/windows.rs | 124 ++---------------- 4 files changed, 40 insertions(+), 190 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..d5e92419 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +// { +// "rust-analyzer.cargo.target": "x86_64-pc-windows-gnu" +// } diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 7204af5d..86a38454 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -12,13 +12,14 @@ use dogstatsd::aggregator_service::AggregatorHandle; use dogstatsd::metric::{Metric, MetricValue, SortedTags}; use tracing::{debug, error}; -const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage"; -const CPU_USAGE_PRECISE_METRIC: &str = "azure.functions.enhanced.test.cpu.usage.precise"; -const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.test.cpu.limit"; +const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.usage"; +const CPU_HOST_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.host.usage"; +const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.cpu.limit"; /// Computed CPU total and limit metrics pub struct CpuStats { pub total: f64, // Cumulative CPU usage in nanoseconds + pub host_total: f64, // Cumulative CPU usage on the host in nanoseconds pub limit: Option, // CPU limit in nanocores pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count } @@ -33,6 +34,7 @@ pub struct CpuMetricsCollector { tags: Option, collection_interval_secs: u64, last_usage_ns: f64, + last_host_usage_ns: f64, last_collection_time: std::time::Instant, } @@ -59,6 +61,7 @@ impl CpuMetricsCollector { tags, collection_interval_secs, last_usage_ns: -1.0, + last_host_usage_ns: -1.0, last_collection_time: std::time::Instant::now(), } } @@ -69,27 +72,26 @@ impl CpuMetricsCollector { debug!("Collected cpu stats!"); let current_usage_ns = cpu_stats.total; debug!("CPU usage: {}", cpu_stats.total); + let current_host_usage_ns = cpu_stats.host_total; + debug!("Host CPU usage: {}", cpu_stats.host_total); let now_instant = std::time::Instant::now(); // Skip first collection if self.last_usage_ns == -1.0 { debug!("First CPU collection, skipping rate computation"); self.last_usage_ns = current_usage_ns; + self.last_host_usage_ns = current_host_usage_ns; self.last_collection_time = now_instant; return; } let delta_ns = current_usage_ns - self.last_usage_ns; self.last_usage_ns = current_usage_ns; - let elapsed_secs = self.last_collection_time.elapsed().as_secs_f64(); - debug!("Elapsed time: {} seconds", elapsed_secs); self.last_collection_time = now_instant; // Divide nanoseconds delta by collection interval to get usage rate in nanocores let usage_rate_nc = delta_ns / self.collection_interval_secs as f64; debug!("Usage rate: {} nanocores/s", usage_rate_nc); - let precise_usage_rate_nc = delta_ns / elapsed_secs; - debug!("Precise usage rate: {} nanocores/s", precise_usage_rate_nc); let now = std::time::UNIX_EPOCH .elapsed() @@ -98,17 +100,6 @@ impl CpuMetricsCollector { .try_into() .unwrap_or(0); - let precise_metric = Metric::new( - CPU_USAGE_PRECISE_METRIC.into(), - MetricValue::distribution(precise_usage_rate_nc), - self.tags.clone(), - Some(now), - ); - - if let Err(e) = self.aggregator.insert_batch(vec![precise_metric]) { - error!("Failed to insert CPU usage precise metric: {}", e); - } - let usage_metric = Metric::new( CPU_USAGE_METRIC.into(), MetricValue::distribution(usage_rate_nc), @@ -120,6 +111,21 @@ impl CpuMetricsCollector { error!("Failed to insert CPU usage metric: {}", e); } + // Host VM-level CPU usage + let host_delta_ns = current_host_usage_ns - self.last_host_usage_ns; + self.last_host_usage_ns = current_host_usage_ns; + let host_usage_rate_nc = host_delta_ns / self.collection_interval_secs as f64; + debug!("CPU host usage rate: {} nanocores", host_usage_rate_nc); + + if let Err(e) = self.aggregator.insert_batch(vec![Metric::new( + CPU_HOST_USAGE_METRIC.into(), + MetricValue::distribution(host_usage_rate_nc), + self.tags.clone(), + Some(now), + )]) { + error!("Failed to insert CPU host usage metric: {}", e); + } + if let Some(limit) = cpu_stats.limit { debug!("CPU limit: {}", limit); if cpu_stats.defaulted_limit { diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index 11e2dedd..47faf3a7 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -30,57 +30,6 @@ struct CgroupStats { pub struct LinuxCpuStatsReader; -pub fn log_cgroup_processes() { - let tasks = match fs::read_to_string("/sys/fs/cgroup/cpu/cgroup.procs") { - Ok(t) => t, - Err(e) => { debug!("Failed to read cgroup tasks: {}", e); return; } - }; - - debug!("Processes in cgroup:"); - let mut total_ns: u64 = 0; - - for line in tasks.lines() { - let pid: u32 = match line.trim().parse() { - Ok(p) => p, - Err(_) => continue, - }; - - let comm = fs::read_to_string(format!("/proc/{}/comm", pid)) - .unwrap_or_default(); - let comm = comm.trim(); - - let stat = match fs::read_to_string(format!("/proc/{}/stat", pid)) { - Ok(s) => s, - Err(_) => { - debug!("PID={} name={} (exited)", pid, comm); - continue; - } - }; - - let fields: Vec<&str> = stat.split_whitespace().collect(); - if fields.len() < 15 { - continue; - } - let utime: u64 = fields[13].parse().unwrap_or(0); - let stime: u64 = fields[14].parse().unwrap_or(0); - let ns = (utime + stime) * 10_000_000; - total_ns += ns; - - debug!("PID={} name={} CPU: {} ns (user: {} ns, kernel: {} ns)", - pid, comm, ns, utime * 10_000_000, stime * 10_000_000); - } - - debug!("Sum of cgroup PIDs: {} ns", total_ns); - - if let Ok(cgroup) = fs::read_to_string(CGROUP_CPU_USAGE_PATH) { - if let Ok(cgroup_ns) = cgroup.trim().parse::() { - debug!("cpuacct.usage: {} ns", cgroup_ns); - debug!("cgroup PIDs sum is {:.1}% of cpuacct.usage", total_ns as f64 / cgroup_ns as f64 * 100.0); - } - } -} - - fn read_proc_stat_snapshot() -> Option { let contents = fs::read_to_string(PROC_STAT_PATH).ok()?; let cpu_line = contents.lines().find(|l| l.starts_with("cpu "))?; @@ -98,13 +47,6 @@ fn read_proc_stat_snapshot() -> Option { impl CpuStatsReader for LinuxCpuStatsReader { fn read(&self) -> Option { debug!("Reading CPU stats from Linux - using procstat"); - log_cgroup_processes(); - // let total_time_ns = read_proc_stat_snapshot()?; - // Some(CpuStats { - // total: total_time_ns as f64, - // limit: Some(num_cpus::get() as f64 * 1000000000.0), - // defaulted_limit: true, - // }) let cgroup_stats = read_cgroup_stats(); build_cpu_stats(&cgroup_stats) } @@ -116,8 +58,11 @@ fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { let (limit_nc, defaulted) = compute_cpu_limit_nc(cgroup_stats); + let host_total = read_proc_stat_snapshot().unwrap_or(0); + Some(CpuStats { total: total as f64, + host_total: host_total as f64, limit: Some(limit_nc), defaulted_limit: defaulted, }) diff --git a/crates/datadog-metrics-collector/src/windows.rs b/crates/datadog-metrics-collector/src/windows.rs index 03868ad5..dc2753bb 100644 --- a/crates/datadog-metrics-collector/src/windows.rs +++ b/crates/datadog-metrics-collector/src/windows.rs @@ -11,64 +11,22 @@ use crate::cpu::{CpuStats, CpuStatsReader}; use std::mem::size_of; use tracing::debug; -use windows::Win32::Foundation::{FILETIME, CloseHandle}; +use windows::Win32::Foundation::{CloseHandle, FILETIME}; +use windows::Win32::System::Diagnostics::ToolHelp::{ + CreateToolhelp32Snapshot, Process32FirstW, Process32NextW, PROCESSENTRY32W, TH32CS_SNAPPROCESS, +}; use windows::Win32::System::JobObjects::{ JobObjectBasicAccountingInformation, JobObjectCpuRateControlInformation, QueryInformationJobObject, JOBOBJECT_BASIC_ACCOUNTING_INFORMATION, JOBOBJECT_CPU_RATE_CONTROL_INFORMATION, JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, }; -use windows::Win32::System::Threading::{GetCurrentProcess, GetProcessTimes, GetSystemTimes, OpenProcess, PROCESS_QUERY_LIMITED_INFORMATION}; -use windows::Win32::System::Diagnostics::ToolHelp::{ - CreateToolhelp32Snapshot, Process32FirstW, Process32NextW, - PROCESSENTRY32W, TH32CS_SNAPPROCESS, +use windows::Win32::System::Threading::{ + GetCurrentProcess, GetProcessTimes, GetSystemTimes, OpenProcess, + PROCESS_QUERY_LIMITED_INFORMATION, }; pub struct WindowsCpuStatsReader; -pub fn log_all_processes() { - let snapshot = unsafe { CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0) }; - let Ok(snapshot) = snapshot else { - debug!("Failed to create process snapshot"); - return; - }; - - let mut entry = PROCESSENTRY32W::default(); - entry.dwSize = size_of::() as u32; - - if unsafe { Process32FirstW(snapshot, &mut entry) }.is_err() { - debug!("Failed to get first process"); - return; - } - - loop { - let name = String::from_utf16_lossy( - &entry.szExeFile[..entry.szExeFile.iter().position(|&c| c == 0).unwrap_or(260)] - ); - debug!( - "Process: PID={} PPID={} name={}", - entry.th32ProcessID, - entry.th32ParentProcessID, - name - ); - if let Some(cpu_ns) = read_process_cpu_time_ns(entry.th32ProcessID) { - debug!( - "PID={} CPU: {} ns", - entry.th32ProcessID, - cpu_ns - ); - } else { - debug!("PID={} CPU: unavailable", entry.th32ProcessID); - } - - - if unsafe { Process32NextW(snapshot, &mut entry) }.is_err() { - break; - } - } - let job_total = read_job_cpu_time_ns(); - debug!("Job Object total CPU: {:?} ns", job_total); -} - fn read_system_cpu_usage_ns() -> Option { let mut idle = FILETIME::default(); let mut kernel = FILETIME::default(); @@ -111,65 +69,6 @@ fn read_job_cpu_time_ns() -> Option { Some(total_ns) } -fn read_process_cpu_time_ns(pid: u32) -> Option { - let mut creation_time = FILETIME::default(); - let mut exit_time = FILETIME::default(); - let mut kernel_time = FILETIME::default(); - let mut user_time = FILETIME::default(); - - unsafe { - let handle = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, false, pid).ok()?; - let result = GetProcessTimes( - handle, - &mut creation_time, - &mut exit_time, - &mut kernel_time, - &mut user_time, - ); - CloseHandle(handle).ok(); - result.ok()?; - } - - let user_ns = filetime_to_ns(&user_time); - let kernel_ns = filetime_to_ns(&kernel_time); - debug!("PID={} CPU: {} ns (user: {} ns, kernel: {} ns)", pid, user_ns + kernel_ns, user_ns, kernel_ns); - Some(user_ns + kernel_ns) -} - -fn read_process_cpu_usage_ns() -> Option { - // Using GetProcessTimes - let mut creation_time = FILETIME::default(); - let mut exit_time = FILETIME::default(); - let mut kernel_time = FILETIME::default(); - let mut user_time = FILETIME::default(); - - // All calls to Win32 APIs require unsafe in Rust - unsafe { - let handle = GetCurrentProcess(); - GetProcessTimes( - handle, - &mut creation_time, - &mut exit_time, - &mut kernel_time, - &mut user_time, - ) - .ok()?; - } - - // The FILETIME struct contains two 32-bit values that combine to form a 64-bit count of 100-nanosecond time units - // Multiply by 100 to get a 64-bit count of nanoseconds - let user_time_ns = - (((user_time.dwHighDateTime as u64) << 32) | user_time.dwLowDateTime as u64) * 100; - let kernel_time_ns = - (((kernel_time.dwHighDateTime as u64) << 32) | kernel_time.dwLowDateTime as u64) * 100; - let total_time_ns = user_time_ns + kernel_time_ns; - debug!( - "Windows CPU usage: {} ns (user: {} ns, kernel: {} ns)", - total_time_ns, user_time_ns, kernel_time_ns - ); - Some(total_time_ns) -} - /// Reads the CPU rate limit for the job object in nanocores. /// Falls back to host CPU count if no limit is set. fn read_job_cpu_limit_nc() -> (f64, bool) { @@ -207,18 +106,15 @@ fn read_job_cpu_limit_nc() -> (f64, bool) { impl CpuStatsReader for WindowsCpuStatsReader { fn read(&self) -> Option { - debug!("Reading CPU stats from Windows - using Job Object and comparing to GetProcessTimes for each process"); - log_all_processes(); - - // let total_time_ns = read_system_cpu_usage_ns()?; // Using QueryInformationJobObject let total_time_ns = read_job_cpu_time_ns()?; - // Using GetProcessTimes - // let total_time_ns = read_process_cpu_usage_ns()?; + let host_total = read_system_cpu_usage_ns()?; let (limit_nc, defaulted_limit) = read_job_cpu_limit_nc(); + Some(CpuStats { total: total_time_ns as f64, + host_total: host_total as f64, limit: Some(limit_nc), defaulted_limit: defaulted_limit, }) From cba2d693f4da5cd5422d437a76f5904ade8d5a6a Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Thu, 5 Mar 2026 18:05:56 -0500 Subject: [PATCH 12/35] Add tags to metrics --- Cargo.lock | 1 + crates/datadog-serverless-compat/Cargo.toml | 1 + crates/datadog-serverless-compat/src/main.rs | 50 +++++++++++++++----- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b98af55d..62f3d9ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -422,6 +422,7 @@ dependencies = [ "datadog-metrics-collector", "datadog-trace-agent", "dogstatsd", + "libdd-common", "libdd-trace-utils", "reqwest", "tokio", diff --git a/crates/datadog-serverless-compat/Cargo.toml b/crates/datadog-serverless-compat/Cargo.toml index fcbd3da0..c5e13d65 100644 --- a/crates/datadog-serverless-compat/Cargo.toml +++ b/crates/datadog-serverless-compat/Cargo.toml @@ -13,6 +13,7 @@ windows-pipes = ["datadog-trace-agent/windows-pipes", "dogstatsd/windows-pipes"] datadog-trace-agent = { path = "../datadog-trace-agent" } datadog-metrics-collector = { path = "../datadog-metrics-collector" } libdd-trace-utils = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95" } +libdd-common = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95", default-features = false } datadog-fips = { path = "../datadog-fips", default-features = false } dogstatsd = { path = "../dogstatsd", default-features = true } reqwest = { version = "0.12.4", default-features = false } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index eca409e1..55e7885f 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -25,6 +25,7 @@ use datadog_trace_agent::{ use datadog_metrics_collector::cpu::CpuMetricsCollector; +use libdd_common::azure_app_services; use libdd_trace_utils::{config_utils::read_cloud_env, trace_utils::EnvironmentType}; use datadog_fips::reqwest_adapter::create_reqwest_client_builder; @@ -206,18 +207,45 @@ pub async fn main() { // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { - // Elastic Premium and Premium plans use WEBSITE_INSTANCE_ID to identify the instance - // Flex Consumption and Consumption plans use WEBSITE_POD_NAME or CONTAINER_NAME - let instance_id = env::var("WEBSITE_INSTANCE_ID") - .or_else(|_| env::var("WEBSITE_POD_NAME")) - .or_else(|_| env::var("CONTAINER_NAME")) - .ok(); - debug!("Instance ID: {:?}", instance_id); - let mut tag_str = format!("functionname:{}", app_name); - if let Some(id) = instance_id { - tag_str.push_str(&format!(",instance_id:{}", id)); + let mut tag_parts = vec![format!("functionname:{}", app_name)]; + // Azure tags from ddcommon + if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { + let aas_tags = [ + ("aas.resource.id", aas_metadata.get_resource_id()), + ("aas.resource.group", aas_metadata.get_resource_group()), + ("aas.subscription.id", aas_metadata.get_subscription_id()), + ("aas.site.name", aas_metadata.get_site_name()), + ]; + for (name, value) in aas_tags { + if value != "unknown" { + tag_parts.push(format!("{}:{}", name, value)); + } + } + } + + // Azure region and plan tier from env vars (not in traces) + for (tag_name, env_var) in [("region", "REGION_NAME"), ("plan_tier", "WEBSITE_SKU")] { + if let Ok(val) = env::var(env_var) { + if !val.is_empty() { + tag_parts.push(format!("{}:{}", tag_name, val)); + } + } + } + // Datadog tags + // Origin tag is already added by DogStatsD + for (tag_name, env_var) in [ + ("service", "DD_SERVICE"), + ("env", "DD_ENV"), + ("version", "DD_VERSION"), + ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"), + ] { + if let Ok(val) = env::var(env_var) { + if !val.is_empty() { + tag_parts.push(format!("{}:{}", tag_name, val)); + } + } } - let tags = SortedTags::parse(&tag_str).ok(); + let tags = SortedTags::parse(&tag_parts.join(",")).ok(); CpuMetricsCollector::new(handle.clone(), tags, CPU_METRICS_COLLECTION_INTERVAL) }) } else { From 70aa6fee2f590187ff0097a3f526b745eb5129e7 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 12:43:58 -0500 Subject: [PATCH 13/35] Ensure tags match cloud integration metrics --- crates/datadog-serverless-compat/src/main.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 55e7885f..52649b71 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -211,10 +211,10 @@ pub async fn main() { // Azure tags from ddcommon if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { let aas_tags = [ - ("aas.resource.id", aas_metadata.get_resource_id()), - ("aas.resource.group", aas_metadata.get_resource_group()), - ("aas.subscription.id", aas_metadata.get_subscription_id()), - ("aas.site.name", aas_metadata.get_site_name()), + ("resource_id", aas_metadata.get_resource_id()), + ("resource_group", aas_metadata.get_resource_group()), + ("subscription_id", aas_metadata.get_subscription_id()), + ("name", aas_metadata.get_site_name()), ]; for (name, value) in aas_tags { if value != "unknown" { From eec42022a2070c02f222441c5cdec0178d268b11 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 14:11:55 -0500 Subject: [PATCH 14/35] Separate Windows CPU metrics collection into separate PR --- .../datadog-metrics-collector/src/windows.rs | 114 +----------------- 1 file changed, 2 insertions(+), 112 deletions(-) diff --git a/crates/datadog-metrics-collector/src/windows.rs b/crates/datadog-metrics-collector/src/windows.rs index dc2753bb..10ac474c 100644 --- a/crates/datadog-metrics-collector/src/windows.rs +++ b/crates/datadog-metrics-collector/src/windows.rs @@ -9,123 +9,13 @@ //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). use crate::cpu::{CpuStats, CpuStatsReader}; -use std::mem::size_of; use tracing::debug; -use windows::Win32::Foundation::{CloseHandle, FILETIME}; -use windows::Win32::System::Diagnostics::ToolHelp::{ - CreateToolhelp32Snapshot, Process32FirstW, Process32NextW, PROCESSENTRY32W, TH32CS_SNAPPROCESS, -}; -use windows::Win32::System::JobObjects::{ - JobObjectBasicAccountingInformation, JobObjectCpuRateControlInformation, - QueryInformationJobObject, JOBOBJECT_BASIC_ACCOUNTING_INFORMATION, - JOBOBJECT_CPU_RATE_CONTROL_INFORMATION, JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, -}; -use windows::Win32::System::Threading::{ - GetCurrentProcess, GetProcessTimes, GetSystemTimes, OpenProcess, - PROCESS_QUERY_LIMITED_INFORMATION, -}; pub struct WindowsCpuStatsReader; -fn read_system_cpu_usage_ns() -> Option { - let mut idle = FILETIME::default(); - let mut kernel = FILETIME::default(); - let mut user = FILETIME::default(); - - unsafe { - GetSystemTimes(Some(&mut idle), Some(&mut kernel), Some(&mut user)).ok()?; - } - let idle_ns = filetime_to_ns(&idle); - let kernel_ns = filetime_to_ns(&kernel); - let user_ns = filetime_to_ns(&user); - let active_ns = (kernel_ns - idle_ns) + user_ns; - Some(active_ns) -} - -fn filetime_to_ns(filetime: &FILETIME) -> u64 { - (((filetime.dwHighDateTime as u64) << 32) | filetime.dwLowDateTime as u64) * 100 -} - -fn read_job_cpu_time_ns() -> Option { - let mut info = JOBOBJECT_BASIC_ACCOUNTING_INFORMATION::default(); - unsafe { - QueryInformationJobObject( - None, // If the handle is null, the job associated with the current process is used - JobObjectBasicAccountingInformation, // The type of info to retrieve - &mut info as *mut _ as *mut _, // Pointer to the struct to receive the info - size_of::() as u32, - None, - ) - .ok()?; - }; - // TotalUserTime and TotalKernelTime are in 100-nanosecond units - multiply by 100 to get nanoseconds - let total_ns = (info.TotalUserTime + info.TotalKernelTime) as u64 * 100; - debug!( - "Job CPU time: {} ns (user: {} ns, kernel: {} ns)", - total_ns, - info.TotalUserTime as u64 * 100, - info.TotalKernelTime as u64 * 100 - ); - Some(total_ns) -} - -/// Reads the CPU rate limit for the job object in nanocores. -/// Falls back to host CPU count if no limit is set. -fn read_job_cpu_limit_nc() -> (f64, bool) { - let mut info = JOBOBJECT_CPU_RATE_CONTROL_INFORMATION::default(); - let result = unsafe { - QueryInformationJobObject( - None, - JobObjectCpuRateControlInformation, - &mut info as *mut _ as *mut _, - size_of::() as u32, - None, - ) - }; - if result.is_ok() - && (info.ControlFlags & JOB_OBJECT_CPU_RATE_CONTROL_ENABLE) - == JOB_OBJECT_CPU_RATE_CONTROL_ENABLE - { - // CpuRate is in units of 1/100th of a percent (10000 = 100% = 1 core) - let cpu_rate = unsafe { info.Anonymous.CpuRate } as f64; - let limit_nc = (cpu_rate / 10000.0) * num_cpus::get() as f64 * 1_000_000_000.0; - debug!( - "Job CPU rate limit: {} nanocores (CpuRate: {})", - limit_nc, cpu_rate - ); - (limit_nc, false) - } else { - let limit_nc = num_cpus::get() as f64 * 1_000_000_000.0; - debug!( - "No job CPU rate limit found, defaulting to host CPU count: {} nanocores", - limit_nc - ); - (limit_nc, true) - } -} - impl CpuStatsReader for WindowsCpuStatsReader { fn read(&self) -> Option { - // Using QueryInformationJobObject - let total_time_ns = read_job_cpu_time_ns()?; - let host_total = read_system_cpu_usage_ns()?; - - let (limit_nc, defaulted_limit) = read_job_cpu_limit_nc(); - - Some(CpuStats { - total: total_time_ns as f64, - host_total: host_total as f64, - limit: Some(limit_nc), - defaulted_limit: defaulted_limit, - }) - - // let limit_nc = num_cpus::get() as f64 * 1000000000.0; - // debug!("Windows CPU limit: {} nc", limit_nc); - - // Some(CpuStats { - // total: total_time_ns as f64, - // limit: Some(limit_nc), - // defaulted_limit: true, - // }) + debug!("CPU enhanced metrics not yet supported on Windows Azure Functions"); + None } } From ddfd37fa870a5035f43a03c3a62cceb610646ded Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 14:15:39 -0500 Subject: [PATCH 15/35] Separate CPU host usage metrics collection into separate PR --- Cargo.lock | 65 ------------------- crates/datadog-metrics-collector/Cargo.toml | 5 +- crates/datadog-metrics-collector/src/cpu.rs | 22 ------- crates/datadog-metrics-collector/src/linux.rs | 19 ------ 4 files changed, 1 insertion(+), 110 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 62f3d9ad..52ce5a6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -397,7 +397,6 @@ dependencies = [ "dogstatsd", "num_cpus", "tracing", - "windows", ] [[package]] @@ -3047,76 +3046,12 @@ dependencies = [ "rustix 0.38.44", ] -[[package]] -name = "windows" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" -dependencies = [ - "windows-core", - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-core" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-result", - "windows-strings", - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-implement" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.114", -] - -[[package]] -name = "windows-interface" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.114", -] - [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-result" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-strings" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" -dependencies = [ - "windows-result", - "windows-targets 0.52.6", -] - [[package]] name = "windows-sys" version = "0.52.0" diff --git a/crates/datadog-metrics-collector/Cargo.toml b/crates/datadog-metrics-collector/Cargo.toml index 5531b8e0..1fe23323 100644 --- a/crates/datadog-metrics-collector/Cargo.toml +++ b/crates/datadog-metrics-collector/Cargo.toml @@ -8,7 +8,4 @@ description = "Collector to read, compute, and submit enhanced metrics in Server [dependencies] dogstatsd = { path = "../dogstatsd", default-features = true } num_cpus = "1.16" -tracing = { version = "0.1", default-features = false } - -[target.'cfg(target_os = "windows")'.dependencies] -windows = { version = "0.58", features = ["Win32_System_Threading", "Win32_Foundation", "Win32_System_JobObjects", "Win32_System_Diagnostics_ToolHelp"] } \ No newline at end of file +tracing = { version = "0.1", default-features = false } \ No newline at end of file diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 86a38454..64dcaf24 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -13,13 +13,11 @@ use dogstatsd::metric::{Metric, MetricValue, SortedTags}; use tracing::{debug, error}; const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.usage"; -const CPU_HOST_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.host.usage"; const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.cpu.limit"; /// Computed CPU total and limit metrics pub struct CpuStats { pub total: f64, // Cumulative CPU usage in nanoseconds - pub host_total: f64, // Cumulative CPU usage on the host in nanoseconds pub limit: Option, // CPU limit in nanocores pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count } @@ -34,7 +32,6 @@ pub struct CpuMetricsCollector { tags: Option, collection_interval_secs: u64, last_usage_ns: f64, - last_host_usage_ns: f64, last_collection_time: std::time::Instant, } @@ -61,7 +58,6 @@ impl CpuMetricsCollector { tags, collection_interval_secs, last_usage_ns: -1.0, - last_host_usage_ns: -1.0, last_collection_time: std::time::Instant::now(), } } @@ -72,15 +68,12 @@ impl CpuMetricsCollector { debug!("Collected cpu stats!"); let current_usage_ns = cpu_stats.total; debug!("CPU usage: {}", cpu_stats.total); - let current_host_usage_ns = cpu_stats.host_total; - debug!("Host CPU usage: {}", cpu_stats.host_total); let now_instant = std::time::Instant::now(); // Skip first collection if self.last_usage_ns == -1.0 { debug!("First CPU collection, skipping rate computation"); self.last_usage_ns = current_usage_ns; - self.last_host_usage_ns = current_host_usage_ns; self.last_collection_time = now_instant; return; } @@ -111,21 +104,6 @@ impl CpuMetricsCollector { error!("Failed to insert CPU usage metric: {}", e); } - // Host VM-level CPU usage - let host_delta_ns = current_host_usage_ns - self.last_host_usage_ns; - self.last_host_usage_ns = current_host_usage_ns; - let host_usage_rate_nc = host_delta_ns / self.collection_interval_secs as f64; - debug!("CPU host usage rate: {} nanocores", host_usage_rate_nc); - - if let Err(e) = self.aggregator.insert_batch(vec![Metric::new( - CPU_HOST_USAGE_METRIC.into(), - MetricValue::distribution(host_usage_rate_nc), - self.tags.clone(), - Some(now), - )]) { - error!("Failed to insert CPU host usage metric: {}", e); - } - if let Some(limit) = cpu_stats.limit { debug!("CPU limit: {}", limit); if cpu_stats.defaulted_limit { diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index 47faf3a7..402fee15 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -17,8 +17,6 @@ const CGROUP_CPU_USAGE_PATH: &str = "/sys/fs/cgroup/cpu/cpuacct.usage"; // Repor const CGROUP_CPUSET_CPUS_PATH: &str = "/sys/fs/cgroup/cpuset/cpuset.cpus"; // Specifies the CPUs that tasks in this cgroup are permitted to access const CGROUP_CPU_PERIOD_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; // Specifies a period of time, in microseconds, for how regularly a cgroup's access to CPU resources should be reallocated const CGROUP_CPU_QUOTA_PATH: &str = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; // Specifies the total amount of time, in microseconds, for which all tasks in a cgroup can run during one period -const PROC_UPTIME_PATH: &str = "/proc/uptime"; // Reports the total uptime of the system, in seconds -const PROC_STAT_PATH: &str = "/proc/stat"; // Reports the total CPU time, in nanoseconds, consumed by all processes and threads in the system /// Statistics from cgroup v1 files, normalized to nanoseconds struct CgroupStats { @@ -30,20 +28,6 @@ struct CgroupStats { pub struct LinuxCpuStatsReader; -fn read_proc_stat_snapshot() -> Option { - let contents = fs::read_to_string(PROC_STAT_PATH).ok()?; - let cpu_line = contents.lines().find(|l| l.starts_with("cpu "))?; - let mut values = cpu_line.split_whitespace(); - values.next(); // skip "cpu" label - let user: u64 = values.next()?.parse().ok()?; - let nice: u64 = values.next()?.parse().ok()?; - let system: u64 = values.next()?.parse().ok()?; - // jiffies to nanoseconds (USER_HZ=100, so 1 jiffy = 10_000_000 ns) - let active_ns = (user + nice + system) * 10_000_000; - debug!("proc/stat active: {} ns", active_ns); - Some(active_ns) -} - impl CpuStatsReader for LinuxCpuStatsReader { fn read(&self) -> Option { debug!("Reading CPU stats from Linux - using procstat"); @@ -58,11 +42,8 @@ fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { let (limit_nc, defaulted) = compute_cpu_limit_nc(cgroup_stats); - let host_total = read_proc_stat_snapshot().unwrap_or(0); - Some(CpuStats { total: total as f64, - host_total: host_total as f64, limit: Some(limit_nc), defaulted_limit: defaulted, }) From 5953d6832de0e5e114272052ada44d083dd786ff Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 14:22:46 -0500 Subject: [PATCH 16/35] Remove functionname tag --- .vscode/settings.json | 3 --- crates/datadog-serverless-compat/src/main.rs | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index d5e92419..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -// { -// "rust-analyzer.cargo.target": "x86_64-pc-windows-gnu" -// } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 52649b71..d1c3a509 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -207,7 +207,7 @@ pub async fn main() { // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { - let mut tag_parts = vec![format!("functionname:{}", app_name)]; + let mut tag_parts = Vec::new(); // Azure tags from ddcommon if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { let aas_tags = [ From 45317ff0eed4384d848c7a34e7ab78a557199871 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 18:43:14 -0500 Subject: [PATCH 17/35] Send enhanced metrics even if custom metrics are turned off --- crates/datadog-metrics-collector/src/cpu.rs | 2 +- crates/datadog-serverless-compat/src/main.rs | 143 ++++++++++--------- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 64dcaf24..d9bad10f 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -8,7 +8,7 @@ //! //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). -use dogstatsd::aggregator_service::AggregatorHandle; +use dogstatsd::aggregator::AggregatorHandle; use dogstatsd::metric::{Metric, MetricValue, SortedTags}; use tracing::{debug, error}; diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index d1c3a509..b4e981d8 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -54,7 +54,7 @@ pub async fn main() { .map(|val| val.to_lowercase()) .unwrap_or("info".to_string()); - let (app_name, env_type) = match read_cloud_env() { + let (_, env_type) = match read_cloud_env() { Some(value) => value, None => { error!("Unable to identify environment. Shutting down Mini Agent."); @@ -108,7 +108,7 @@ pub async fn main() { .ok() .and_then(|val| parse_metric_namespace(&val)); - let dd_enhanced_metrics = env::var("DD_ENHANCED_METRICS") + let dd_enhanced_metrics = env::var("DD_ENHANCED_METRICS_ENABLED") .map(|val| val.to_lowercase() != "false") .unwrap_or(true); @@ -178,27 +178,35 @@ pub async fn main() { } }); - let (metrics_flusher, aggregator_handle) = if dd_use_dogstatsd { - debug!("Starting dogstatsd"); - let (_, metrics_flusher, aggregator_handle) = start_dogstatsd( - dd_dogstatsd_port, - dd_api_key, - dd_site, - https_proxy, - dogstatsd_tags, - dd_statsd_metric_namespace, - #[cfg(all(windows, feature = "windows-pipes"))] - dd_dogstatsd_windows_pipe_name.clone(), - ) - .await; - if let Some(ref windows_pipe_name) = dd_dogstatsd_windows_pipe_name { - info!("dogstatsd-pipe: starting to listen on pipe {windows_pipe_name}"); + let needs_aggregator = dd_use_dogstatsd || dd_enhanced_metrics; + + let (metrics_flusher, aggregator_handle) = if needs_aggregator { + debug!("Creating metrics flusher and aggregator"); + + let (flusher, handle) = + start_aggregator(dd_api_key, dd_site, https_proxy, dogstatsd_tags).await; + + if dd_use_dogstatsd { + debug!("Starting dogstatsd"); + start_dogstatsd_listener( + dd_dogstatsd_port, + handle.clone(), + dd_statsd_metric_namespace, + #[cfg(all(windows, feature = "windows-pipes"))] + dd_dogstatsd_windows_pipe_name.clone(), + ) + .await; + if let Some(ref windows_pipe_name) = dd_dogstatsd_windows_pipe_name { + info!("dogstatsd-pipe: starting to listen on pipe {windows_pipe_name}"); + } else { + info!("dogstatsd-udp: starting to listen on port {dd_dogstatsd_port}"); + } } else { - info!("dogstatsd-udp: starting to listen on port {dd_dogstatsd_port}"); + info!("dogstatsd disabled"); } - (metrics_flusher, Some(aggregator_handle)) + (flusher, Some(handle)) } else { - info!("dogstatsd disabled"); + info!("dogstatsd and enhanced metrics disabled"); (None, None) }; @@ -276,16 +284,13 @@ pub async fn main() { } } -async fn start_dogstatsd( - port: u16, +async fn start_aggregator( dd_api_key: Option, dd_site: String, https_proxy: Option, dogstatsd_tags: &str, - metric_namespace: Option, - #[cfg(all(windows, feature = "windows-pipes"))] windows_pipe_name: Option, -) -> (CancellationToken, Option, AggregatorHandle) { - // 1. Create the aggregator service +) -> (Option, AggregatorHandle) { + // Create the aggregator service #[allow(clippy::expect_used)] let (service, handle) = AggregatorService::new( SortedTags::parse(dogstatsd_tags).unwrap_or(EMPTY_TAGS), @@ -293,53 +298,18 @@ async fn start_dogstatsd( ) .expect("Failed to create aggregator service"); - // 2. Start the aggregator service in the background + // Start the aggregator service in the background tokio::spawn(service.run()); - #[cfg(all(windows, feature = "windows-pipes"))] - let dogstatsd_config = DogStatsDConfig { - host: AGENT_HOST.to_string(), - port, - metric_namespace, - windows_pipe_name, - so_rcvbuf: None, - buffer_size: None, - queue_size: None, - }; - - #[cfg(not(all(windows, feature = "windows-pipes")))] - let dogstatsd_config = DogStatsDConfig { - host: AGENT_HOST.to_string(), - port, - metric_namespace, - so_rcvbuf: None, - buffer_size: None, - queue_size: None, - }; - let dogstatsd_cancel_token = tokio_util::sync::CancellationToken::new(); - - // 3. Use handle in DogStatsD (cheap to clone) - let dogstatsd_client = DogStatsD::new( - &dogstatsd_config, - handle.clone(), - dogstatsd_cancel_token.clone(), - ) - .await; - - tokio::spawn(async move { - dogstatsd_client.spin().await; - }); - let metrics_flusher = match dd_api_key { Some(dd_api_key) => { let client = match build_metrics_client(https_proxy, DOGSTATSD_TIMEOUT_DURATION) { Ok(client) => client, Err(e) => { error!("Failed to build HTTP client: {e}, won't flush metrics"); - return (dogstatsd_cancel_token, None, handle); + return (None, handle); } }; - let metrics_intake_url_prefix = match Site::new(dd_site) .map_err(|e| e.to_string()) .and_then(|site| { @@ -348,7 +318,7 @@ async fn start_dogstatsd( Ok(prefix) => prefix, Err(e) => { error!("Failed to create metrics intake URL: {e}, won't flush metrics"); - return (dogstatsd_cancel_token, None, handle); + return (None, handle); } }; @@ -368,7 +338,50 @@ async fn start_dogstatsd( } }; - (dogstatsd_cancel_token, metrics_flusher, handle) + (metrics_flusher, handle) +} + +async fn start_dogstatsd_listener( + port: u16, + handle: AggregatorHandle, + metric_namespace: Option, + #[cfg(all(windows, feature = "windows-pipes"))] windows_pipe_name: Option, +) -> CancellationToken { + #[cfg(all(windows, feature = "windows-pipes"))] + let dogstatsd_config = DogStatsDConfig { + host: AGENT_HOST.to_string(), + port, + metric_namespace, + windows_pipe_name, + so_rcvbuf: None, + buffer_size: None, + queue_size: None, + }; + + #[cfg(not(all(windows, feature = "windows-pipes")))] + let dogstatsd_config = DogStatsDConfig { + host: AGENT_HOST.to_string(), + port, + metric_namespace, + so_rcvbuf: None, + buffer_size: None, + queue_size: None, + }; + let dogstatsd_cancel_token = tokio_util::sync::CancellationToken::new(); + + // Use handle in DogStatsD (cheap to clone) + let dogstatsd_client = DogStatsD::new( + &dogstatsd_config, + handle.clone(), + dogstatsd_cancel_token.clone(), + ) + .await; + + tokio::spawn(async move { + dogstatsd_client.spin().await; + }); + + dogstatsd_cancel_token } fn build_metrics_client( From 12bfde213281e2d8b46059075691e8a92d643299 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 19:03:54 -0500 Subject: [PATCH 18/35] Pull out building metrics tags into function --- crates/datadog-serverless-compat/src/main.rs | 86 +++++++++++--------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index b4e981d8..238a7759 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -215,45 +215,7 @@ pub async fn main() { // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { - let mut tag_parts = Vec::new(); - // Azure tags from ddcommon - if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { - let aas_tags = [ - ("resource_id", aas_metadata.get_resource_id()), - ("resource_group", aas_metadata.get_resource_group()), - ("subscription_id", aas_metadata.get_subscription_id()), - ("name", aas_metadata.get_site_name()), - ]; - for (name, value) in aas_tags { - if value != "unknown" { - tag_parts.push(format!("{}:{}", name, value)); - } - } - } - - // Azure region and plan tier from env vars (not in traces) - for (tag_name, env_var) in [("region", "REGION_NAME"), ("plan_tier", "WEBSITE_SKU")] { - if let Ok(val) = env::var(env_var) { - if !val.is_empty() { - tag_parts.push(format!("{}:{}", tag_name, val)); - } - } - } - // Datadog tags - // Origin tag is already added by DogStatsD - for (tag_name, env_var) in [ - ("service", "DD_SERVICE"), - ("env", "DD_ENV"), - ("version", "DD_VERSION"), - ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"), - ] { - if let Ok(val) = env::var(env_var) { - if !val.is_empty() { - tag_parts.push(format!("{}:{}", tag_name, val)); - } - } - } - let tags = SortedTags::parse(&tag_parts.join(",")).ok(); + let tags = build_cpu_metrics_tags(); CpuMetricsCollector::new(handle.clone(), tags, CPU_METRICS_COLLECTION_INTERVAL) }) } else { @@ -394,3 +356,49 @@ fn build_metrics_client( } Ok(builder.build()?) } + +fn build_cpu_metrics_tags() -> Option { + let mut tag_parts = Vec::new(); + // Azure tags from ddcommon + if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { + let aas_tags = [ + ("resource_id", aas_metadata.get_resource_id()), + ("resource_group", aas_metadata.get_resource_group()), + ("subscription_id", aas_metadata.get_subscription_id()), + ("name", aas_metadata.get_site_name()), + ]; + for (name, value) in aas_tags { + if value != "unknown" { + tag_parts.push(format!("{}:{}", name, value)); + } + } + } + + // Azure region and plan tier from env vars (not in traces) + for (tag_name, env_var) in [("region", "REGION_NAME"), ("plan_tier", "WEBSITE_SKU")] { + if let Ok(val) = env::var(env_var) { + if !val.is_empty() { + tag_parts.push(format!("{}:{}", tag_name, val)); + } + } + } + // Datadog tags + // Origin tag is already added by DogStatsD + for (tag_name, env_var) in [ + ("service", "DD_SERVICE"), + ("env", "DD_ENV"), + ("version", "DD_VERSION"), + ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"), + ] { + if let Ok(val) = env::var(env_var) { + if !val.is_empty() { + tag_parts.push(format!("{}:{}", tag_name, val)); + } + } + } + + if tag_parts.is_empty() { + return None; + } + SortedTags::parse(&tag_parts.join(",")).ok() +} From 8c4cf5f69010a27106e99ffa430554f22462aca4 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 19:44:02 -0500 Subject: [PATCH 19/35] Add unit tests --- crates/datadog-metrics-collector/src/linux.rs | 88 +++++++++++++++++++ .../datadog-metrics-collector/src/windows.rs | 2 +- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index 402fee15..e299d009 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -199,3 +199,91 @@ fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { } limit_nc } + +#[cfg(test)] +mod tests { + use super::*; + + fn make_stats( + cpu_count: Option, + scheduler_quota: Option, + scheduler_period: Option, + ) -> CgroupStats { + CgroupStats { + total: Some(0), + cpu_count, + scheduler_quota, + scheduler_period, + } + } + + #[test] + fn test_no_limit_returns_none() { + let stats = make_stats(None, None, None); + assert!(compute_cgroup_cpu_limit_nc(&stats).is_none()); + } + + #[test] + fn test_quota_unlimited_minus_one_returns_none() { + // quota=-1 is filtered out during parsing, so None here means unlimited + let stats = make_stats(None, None, Some(100_000_000)); + assert!(compute_cgroup_cpu_limit_nc(&stats).is_none()); + } + + #[test] + fn test_limited_to_2_cores_by_quota() { + let stats = make_stats(None, Some(200_000_000), Some(100_000_000)); // 200ms / 100ms = 2 cores + let result = compute_cgroup_cpu_limit_nc(&stats); + assert!((result.unwrap() - 2_000_000_000.0).abs() < 1_000.0); // Tolerance of 1,000 nanocores due to floating point arithmetic rounding errors + } + + #[test] + fn test_limited_to_half_core_by_quota() { + let stats = make_stats(None, Some(50_000_000), Some(100_000_000)); // 50ms / 100ms = 0.5 cores + let result = compute_cgroup_cpu_limit_nc(&stats); + assert!((result.unwrap() - 500_000_000.0).abs() < 1_000.0); + } + + #[test] + fn test_read_cpu_count_single() { + let dir = std::env::temp_dir(); + let path = dir.join("cpuset_single.txt"); + std::fs::write(&path, "0-3\n").unwrap(); + let count = read_cpu_count_from_file(path.to_str().unwrap()).unwrap(); + assert_eq!(count, 4); + } + + #[test] + fn test_read_cpu_count_mixed() { + let dir = std::env::temp_dir(); + let path = dir.join("cpuset_mixed.txt"); + std::fs::write(&path, "0-2,16\n").unwrap(); + let count = read_cpu_count_from_file(path.to_str().unwrap()).unwrap(); + assert_eq!(count, 4); // 0,1,2 + 16 + } + + #[test] + fn test_read_cpu_count_empty_file() { + let dir = std::env::temp_dir(); + let path = dir.join("cpuset_empty.txt"); + std::fs::write(&path, "").unwrap(); + assert!(read_cpu_count_from_file(path.to_str().unwrap()).is_err()); + } + + #[test] + fn test_compute_cpu_limit_nc_with_quota() { + let stats = make_stats(None, Some(200_000_000), Some(100_000_000)); + let (limit, defaulted) = compute_cpu_limit_nc(&stats); + assert!((limit - 2_000_000_000.0).abs() < 1_000.0); + assert!(!defaulted); + } + + #[test] + fn test_compute_cpu_limit_nc_defaults_to_host() { + let stats = make_stats(None, None, None); + let (limit, defaulted) = compute_cpu_limit_nc(&stats); + let expected = num_cpus::get() as f64 * 1_000_000_000.0; + assert!((limit - expected).abs() < 1_000.0); + assert!(defaulted); + } +} diff --git a/crates/datadog-metrics-collector/src/windows.rs b/crates/datadog-metrics-collector/src/windows.rs index 10ac474c..346b556b 100644 --- a/crates/datadog-metrics-collector/src/windows.rs +++ b/crates/datadog-metrics-collector/src/windows.rs @@ -15,7 +15,7 @@ pub struct WindowsCpuStatsReader; impl CpuStatsReader for WindowsCpuStatsReader { fn read(&self) -> Option { - debug!("CPU enhanced metrics not yet supported on Windows Azure Functions"); + debug!("CPU enhanced metrics are not yet supported on Windows Azure Functions"); None } } From a7f9f8de33010359d8d696efe899c1b612c0134d Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 19:53:26 -0500 Subject: [PATCH 20/35] Clean up --- crates/datadog-metrics-collector/src/cpu.rs | 5 +---- crates/datadog-metrics-collector/src/linux.rs | 10 ---------- crates/datadog-serverless-compat/src/main.rs | 3 ++- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index d9bad10f..9ed7610d 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -65,9 +65,8 @@ impl CpuMetricsCollector { pub fn collect_and_submit(&mut self) { if let Some(cpu_stats) = self.reader.read() { // Submit metrics - debug!("Collected cpu stats!"); + debug!("Collected CPU stats!"); let current_usage_ns = cpu_stats.total; - debug!("CPU usage: {}", cpu_stats.total); let now_instant = std::time::Instant::now(); // Skip first collection @@ -84,7 +83,6 @@ impl CpuMetricsCollector { // Divide nanoseconds delta by collection interval to get usage rate in nanocores let usage_rate_nc = delta_ns / self.collection_interval_secs as f64; - debug!("Usage rate: {} nanocores/s", usage_rate_nc); let now = std::time::UNIX_EPOCH .elapsed() @@ -105,7 +103,6 @@ impl CpuMetricsCollector { } if let Some(limit) = cpu_stats.limit { - debug!("CPU limit: {}", limit); if cpu_stats.defaulted_limit { debug!("CPU limit defaulted to host CPU count"); } diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index e299d009..cb9f9cf9 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -30,7 +30,6 @@ pub struct LinuxCpuStatsReader; impl CpuStatsReader for LinuxCpuStatsReader { fn read(&self) -> Option { - debug!("Reading CPU stats from Linux - using procstat"); let cgroup_stats = read_cgroup_stats(); build_cpu_stats(&cgroup_stats) } @@ -109,7 +108,6 @@ fn read_cpu_count_from_file(path: &str) -> Result { format!("File {path} is empty"), )); } - debug!("Contents of {path}: {cpuset_str}"); let mut cpu_count: u64 = 0; @@ -117,7 +115,6 @@ fn read_cpu_count_from_file(path: &str) -> Result { let range: Vec<&str> = part.split('-').collect(); if range.len() == 2 { // Range like "0-3" - debug!("Range: {range:?}"); let start: u64 = range[0].parse().map_err(|e| { io::Error::new( io::ErrorKind::InvalidData, @@ -133,7 +130,6 @@ fn read_cpu_count_from_file(path: &str) -> Result { cpu_count += end - start + 1; } else { // Single CPU like "2" - debug!("Single CPU: {part}"); cpu_count += 1; } } @@ -163,16 +159,10 @@ fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { let mut limit_nc = None; if let Some(cpu_count) = cgroup_stats.cpu_count { - debug!("CPU count from cpuset: {cpu_count}"); let host_cpu_count = num_cpus::get() as u64; if cpu_count != host_cpu_count { - debug!("CPU count from cpuset is not equal to host CPU count"); let cpuset_limit_nc = cpu_count as f64 * 1000000000.0; // Convert to nanocores limit_nc = Some(cpuset_limit_nc); - debug!( - "CPU limit from cpuset: {} CPUs ({} nanocores)", - cpu_count, cpuset_limit_nc - ); } } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 238a7759..352df142 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -188,7 +188,7 @@ pub async fn main() { if dd_use_dogstatsd { debug!("Starting dogstatsd"); - start_dogstatsd_listener( + let _ = start_dogstatsd_listener( dd_dogstatsd_port, handle.clone(), dd_statsd_metric_namespace, @@ -382,6 +382,7 @@ fn build_cpu_metrics_tags() -> Option { } } } + // Datadog tags // Origin tag is already added by DogStatsD for (tag_name, env_var) in [ From feca14c681751873e4302db21df564005ec6beff Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 20:00:50 -0500 Subject: [PATCH 21/35] Refactor --- crates/datadog-metrics-collector/Cargo.toml | 2 +- crates/datadog-serverless-compat/src/main.rs | 17 +++-------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/crates/datadog-metrics-collector/Cargo.toml b/crates/datadog-metrics-collector/Cargo.toml index 1fe23323..4061bd13 100644 --- a/crates/datadog-metrics-collector/Cargo.toml +++ b/crates/datadog-metrics-collector/Cargo.toml @@ -8,4 +8,4 @@ description = "Collector to read, compute, and submit enhanced metrics in Server [dependencies] dogstatsd = { path = "../dogstatsd", default-features = true } num_cpus = "1.16" -tracing = { version = "0.1", default-features = false } \ No newline at end of file +tracing = { version = "0.1", default-features = false } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 352df142..1be99615 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -210,9 +210,6 @@ pub async fn main() { (None, None) }; - // If DD_ENHANCED_METRICS is true, start the CPU metrics collector - // Use the existing aggregator handle - // TODO: See if this works in Google Cloud Functions Gen 1. If not, only enable this for Azure Functions. let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { let tags = build_cpu_metrics_tags(); @@ -374,18 +371,10 @@ fn build_cpu_metrics_tags() -> Option { } } - // Azure region and plan tier from env vars (not in traces) - for (tag_name, env_var) in [("region", "REGION_NAME"), ("plan_tier", "WEBSITE_SKU")] { - if let Ok(val) = env::var(env_var) { - if !val.is_empty() { - tag_parts.push(format!("{}:{}", tag_name, val)); - } - } - } - - // Datadog tags - // Origin tag is already added by DogStatsD + // Tags from env vars (not in ddcommon) - origin tag is added by DogStatsD for (tag_name, env_var) in [ + ("region", "REGION_NAME"), + ("plan_tier", "WEBSITE_SKU"), ("service", "DD_SERVICE"), ("env", "DD_ENV"), ("version", "DD_VERSION"), From c6a55dc810598afe1d3f1bd2cbfd53d648ded225 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 20:02:52 -0500 Subject: [PATCH 22/35] Remove last_collection_time --- crates/datadog-metrics-collector/src/cpu.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 9ed7610d..af114c99 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -32,7 +32,6 @@ pub struct CpuMetricsCollector { tags: Option, collection_interval_secs: u64, last_usage_ns: f64, - last_collection_time: std::time::Instant, } impl CpuMetricsCollector { @@ -58,7 +57,6 @@ impl CpuMetricsCollector { tags, collection_interval_secs, last_usage_ns: -1.0, - last_collection_time: std::time::Instant::now(), } } @@ -73,13 +71,11 @@ impl CpuMetricsCollector { if self.last_usage_ns == -1.0 { debug!("First CPU collection, skipping rate computation"); self.last_usage_ns = current_usage_ns; - self.last_collection_time = now_instant; return; } let delta_ns = current_usage_ns - self.last_usage_ns; self.last_usage_ns = current_usage_ns; - self.last_collection_time = now_instant; // Divide nanoseconds delta by collection interval to get usage rate in nanocores let usage_rate_nc = delta_ns / self.collection_interval_secs as f64; From dfe28a3a43e9aebc307bc498f27f091d2c735f63 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 20:23:33 -0500 Subject: [PATCH 23/35] Only send enhanced metrics for Azure Functions --- crates/datadog-serverless-compat/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 1be99615..1d5cf6c7 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -210,7 +210,7 @@ pub async fn main() { (None, None) }; - let mut cpu_collector = if dd_enhanced_metrics { + let mut cpu_collector = if dd_enhanced_metrics && env_type == EnvironmentType::AzureFunction { aggregator_handle.as_ref().map(|handle| { let tags = build_cpu_metrics_tags(); CpuMetricsCollector::new(handle.clone(), tags, CPU_METRICS_COLLECTION_INTERVAL) From 058ef533f80d4c9bbdee60558da4bca60b5c8dfa Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Fri, 6 Mar 2026 20:45:23 -0500 Subject: [PATCH 24/35] Add back last_collection_time --- crates/datadog-metrics-collector/src/cpu.rs | 20 ++++++++++---------- crates/datadog-serverless-compat/src/main.rs | 5 +++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index af114c99..f5da1d32 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -30,8 +30,8 @@ pub struct CpuMetricsCollector { reader: Box, aggregator: AggregatorHandle, tags: Option, - collection_interval_secs: u64, last_usage_ns: f64, + last_collection_time: std::time::Instant, } impl CpuMetricsCollector { @@ -41,12 +41,7 @@ impl CpuMetricsCollector { /// /// * `aggregator` - The aggregator handle to submit metrics to /// * `tags` - Optional tags to attach to all metrics - /// * `collection_interval_secs` - The interval in seconds to collect the metrics - pub fn new( - aggregator: AggregatorHandle, - tags: Option, - collection_interval_secs: u64, - ) -> Self { + pub fn new(aggregator: AggregatorHandle, tags: Option) -> Self { #[cfg(target_os = "windows")] let reader: Box = Box::new(crate::windows::WindowsCpuStatsReader); #[cfg(not(target_os = "windows"))] @@ -55,8 +50,8 @@ impl CpuMetricsCollector { reader, aggregator, tags, - collection_interval_secs, last_usage_ns: -1.0, + last_collection_time: std::time::Instant::now(), } } @@ -71,14 +66,19 @@ impl CpuMetricsCollector { if self.last_usage_ns == -1.0 { debug!("First CPU collection, skipping rate computation"); self.last_usage_ns = current_usage_ns; + self.last_collection_time = now_instant; return; } let delta_ns = current_usage_ns - self.last_usage_ns; self.last_usage_ns = current_usage_ns; + let elapsed_secs = now_instant + .duration_since(self.last_collection_time) + .as_secs_f64(); + self.last_collection_time = now_instant; - // Divide nanoseconds delta by collection interval to get usage rate in nanocores - let usage_rate_nc = delta_ns / self.collection_interval_secs as f64; + // Divide nanoseconds delta by elapsed time to get usage rate in nanocores + let usage_rate_nc = delta_ns / elapsed_secs; let now = std::time::UNIX_EPOCH .elapsed() diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 1d5cf6c7..19fea126 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -178,7 +178,8 @@ pub async fn main() { } }); - let needs_aggregator = dd_use_dogstatsd || dd_enhanced_metrics; + let needs_aggregator = + dd_use_dogstatsd || (dd_enhanced_metrics && env_type == EnvironmentType::AzureFunction); let (metrics_flusher, aggregator_handle) = if needs_aggregator { debug!("Creating metrics flusher and aggregator"); @@ -213,7 +214,7 @@ pub async fn main() { let mut cpu_collector = if dd_enhanced_metrics && env_type == EnvironmentType::AzureFunction { aggregator_handle.as_ref().map(|handle| { let tags = build_cpu_metrics_tags(); - CpuMetricsCollector::new(handle.clone(), tags, CPU_METRICS_COLLECTION_INTERVAL) + CpuMetricsCollector::new(handle.clone(), tags) }) } else { info!("Enhanced metrics disabled"); From 36bba177c9b3a595be0d590b8aaf346527536874 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Mon, 9 Mar 2026 10:01:02 -0400 Subject: [PATCH 25/35] Only enable enhanced metrics for Azure Functions --- crates/datadog-serverless-compat/src/main.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 19fea126..a9a51666 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -108,9 +108,11 @@ pub async fn main() { .ok() .and_then(|val| parse_metric_namespace(&val)); - let dd_enhanced_metrics = env::var("DD_ENHANCED_METRICS_ENABLED") - .map(|val| val.to_lowercase() != "false") - .unwrap_or(true); + // Only enable enhanced metrics for Azure Functions + let dd_enhanced_metrics = env_type == EnvironmentType::AzureFunction + && env::var("DD_ENHANCED_METRICS_ENABLED") + .map(|val| val.to_lowercase() != "false") + .unwrap_or(true); let https_proxy = env::var("DD_PROXY_HTTPS") .or_else(|_| env::var("HTTPS_PROXY")) @@ -178,8 +180,7 @@ pub async fn main() { } }); - let needs_aggregator = - dd_use_dogstatsd || (dd_enhanced_metrics && env_type == EnvironmentType::AzureFunction); + let needs_aggregator = dd_use_dogstatsd || dd_enhanced_metrics; let (metrics_flusher, aggregator_handle) = if needs_aggregator { debug!("Creating metrics flusher and aggregator"); @@ -211,7 +212,7 @@ pub async fn main() { (None, None) }; - let mut cpu_collector = if dd_enhanced_metrics && env_type == EnvironmentType::AzureFunction { + let mut cpu_collector = if dd_enhanced_metrics { aggregator_handle.as_ref().map(|handle| { let tags = build_cpu_metrics_tags(); CpuMetricsCollector::new(handle.clone(), tags) From c626c03b9ff2fae423ac3d7964dbf8f417ba10fd Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Mon, 9 Mar 2026 10:11:17 -0400 Subject: [PATCH 26/35] Only create CPUMetricsCollector when metrics flusher is successfully created --- crates/datadog-serverless-compat/src/main.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index a9a51666..1bc0242c 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -212,13 +212,17 @@ pub async fn main() { (None, None) }; - let mut cpu_collector = if dd_enhanced_metrics { + let mut cpu_collector = if dd_enhanced_metrics && metrics_flusher.is_some() { aggregator_handle.as_ref().map(|handle| { let tags = build_cpu_metrics_tags(); CpuMetricsCollector::new(handle.clone(), tags) }) } else { - info!("Enhanced metrics disabled"); + if !dd_enhanced_metrics { + info!("Enhanced metrics disabled"); + } else { + info!("Enhanced metrics enabled but metrics flusher not found"); + } None }; From 071d2f0488f068473d2dfed9dae1a3c8a3e306a4 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Mon, 9 Mar 2026 10:24:34 -0400 Subject: [PATCH 27/35] Launch metrics flusher as independent task from collector --- crates/datadog-serverless-compat/src/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 1bc0242c..d69e42d1 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -235,9 +235,11 @@ pub async fn main() { loop { tokio::select! { _ = flush_interval.tick() => { - if let Some(metrics_flusher) = metrics_flusher.as_ref() { + if let Some(metrics_flusher) = metrics_flusher.clone() { debug!("Flushing dogstatsd metrics"); - metrics_flusher.flush().await; + tokio::spawn(async move { + metrics_flusher.flush().await; + }); } } _ = cpu_collection_interval.tick() => { From f6c269404de24d80674d2c1e621ce8326bb4d9f9 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Tue, 10 Mar 2026 11:01:14 -0400 Subject: [PATCH 28/35] Create windows-enhanced-metrics feature for Windows-specific logic --- .github/workflows/build-datadog-serverless-compat.yml | 2 +- .github/workflows/cargo.yml | 2 +- crates/datadog-metrics-collector/Cargo.toml | 3 +++ crates/datadog-metrics-collector/src/cpu.rs | 4 ++-- crates/datadog-metrics-collector/src/lib.rs | 4 ++-- crates/datadog-serverless-compat/Cargo.toml | 1 + 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-datadog-serverless-compat.yml b/.github/workflows/build-datadog-serverless-compat.yml index 13d18ee6..0b04eba6 100644 --- a/.github/workflows/build-datadog-serverless-compat.yml +++ b/.github/workflows/build-datadog-serverless-compat.yml @@ -45,7 +45,7 @@ jobs: retention-days: 3 - if: ${{ inputs.runner == 'windows-2022' }} shell: bash - run: cargo build --release -p datadog-serverless-compat --features windows-pipes + run: cargo build --release -p datadog-serverless-compat --features windows-pipes,windows-enhanced-metrics - if: ${{ inputs.runner == 'windows-2022' }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: diff --git a/.github/workflows/cargo.yml b/.github/workflows/cargo.yml index fc640c5d..7863afd2 100644 --- a/.github/workflows/cargo.yml +++ b/.github/workflows/cargo.yml @@ -95,7 +95,7 @@ jobs: - shell: bash run: | if [[ "${{ inputs.runner }}" == "windows-2022" ]]; then - cargo nextest run --workspace --features datadog-serverless-compat/windows-pipes + cargo nextest run --workspace --features datadog-serverless-compat/windows-pipes,datadog-serverless-compat/windows-enhanced-metrics else cargo nextest run --workspace fi diff --git a/crates/datadog-metrics-collector/Cargo.toml b/crates/datadog-metrics-collector/Cargo.toml index 4061bd13..957949e6 100644 --- a/crates/datadog-metrics-collector/Cargo.toml +++ b/crates/datadog-metrics-collector/Cargo.toml @@ -9,3 +9,6 @@ description = "Collector to read, compute, and submit enhanced metrics in Server dogstatsd = { path = "../dogstatsd", default-features = true } num_cpus = "1.16" tracing = { version = "0.1", default-features = false } + +[features] +windows-enhanced-metrics = [] diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index f5da1d32..f6c37147 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -42,9 +42,9 @@ impl CpuMetricsCollector { /// * `aggregator` - The aggregator handle to submit metrics to /// * `tags` - Optional tags to attach to all metrics pub fn new(aggregator: AggregatorHandle, tags: Option) -> Self { - #[cfg(target_os = "windows")] + #[cfg(feature = "windows-enhanced-metrics")] let reader: Box = Box::new(crate::windows::WindowsCpuStatsReader); - #[cfg(not(target_os = "windows"))] + #[cfg(not(feature = "windows-enhanced-metrics"))] let reader: Box = Box::new(crate::linux::LinuxCpuStatsReader); Self { reader, diff --git a/crates/datadog-metrics-collector/src/lib.rs b/crates/datadog-metrics-collector/src/lib.rs index abf337d5..aa565e5f 100644 --- a/crates/datadog-metrics-collector/src/lib.rs +++ b/crates/datadog-metrics-collector/src/lib.rs @@ -8,7 +8,7 @@ #![cfg_attr(not(test), deny(clippy::unimplemented))] pub mod cpu; -#[cfg(not(target_os = "windows"))] +#[cfg(not(feature = "windows-enhanced-metrics"))] pub(crate) mod linux; -#[cfg(target_os = "windows")] +#[cfg(feature = "windows-enhanced-metrics")] pub(crate) mod windows; diff --git a/crates/datadog-serverless-compat/Cargo.toml b/crates/datadog-serverless-compat/Cargo.toml index c5e13d65..172be4c7 100644 --- a/crates/datadog-serverless-compat/Cargo.toml +++ b/crates/datadog-serverless-compat/Cargo.toml @@ -8,6 +8,7 @@ description = "Binary to run trace-agent and dogstatsd servers in Serverless env [features] default = [] windows-pipes = ["datadog-trace-agent/windows-pipes", "dogstatsd/windows-pipes"] +windows-enhanced-metrics = ["datadog-metrics-collector/windows-enhanced-metrics"] [dependencies] datadog-trace-agent = { path = "../datadog-trace-agent" } From aae174a7415499f32f3da2366e5952ab723a90c7 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Tue, 10 Mar 2026 11:02:19 -0400 Subject: [PATCH 29/35] Add unit to collection interval variable --- crates/datadog-serverless-compat/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index d69e42d1..3f8b047b 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -42,7 +42,7 @@ use dogstatsd::{ use dogstatsd::metric::{SortedTags, EMPTY_TAGS}; use tokio_util::sync::CancellationToken; -const CPU_METRICS_COLLECTION_INTERVAL: u64 = 3; +const CPU_METRICS_COLLECTION_INTERVAL_SECS: u64 = 3; const DOGSTATSD_FLUSH_INTERVAL: u64 = 10; const DOGSTATSD_TIMEOUT_DURATION: Duration = Duration::from_secs(5); const DEFAULT_DOGSTATSD_PORT: u16 = 8125; @@ -228,7 +228,7 @@ pub async fn main() { let mut flush_interval = interval(Duration::from_secs(DOGSTATSD_FLUSH_INTERVAL)); let mut cpu_collection_interval = - interval(Duration::from_secs(CPU_METRICS_COLLECTION_INTERVAL)); + interval(Duration::from_secs(CPU_METRICS_COLLECTION_INTERVAL_SECS)); flush_interval.tick().await; // discard first tick, which is instantaneous cpu_collection_interval.tick().await; From f4437940ae6a990b81fd0c03911e287232c4809c Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Tue, 10 Mar 2026 11:45:48 -0400 Subject: [PATCH 30/35] Make last_usage_ns an Option and keep CPU total as u64 until f64 is needed --- crates/datadog-metrics-collector/src/cpu.rs | 24 ++++++++++++------- crates/datadog-metrics-collector/src/linux.rs | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index f6c37147..43a5f2cf 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -17,7 +17,7 @@ const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.cpu.limit"; /// Computed CPU total and limit metrics pub struct CpuStats { - pub total: f64, // Cumulative CPU usage in nanoseconds + pub total: u64, // Cumulative CPU usage in nanoseconds pub limit: Option, // CPU limit in nanocores pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count } @@ -30,7 +30,7 @@ pub struct CpuMetricsCollector { reader: Box, aggregator: AggregatorHandle, tags: Option, - last_usage_ns: f64, + last_usage_ns: Option, last_collection_time: std::time::Instant, } @@ -50,7 +50,7 @@ impl CpuMetricsCollector { reader, aggregator, tags, - last_usage_ns: -1.0, + last_usage_ns: None, last_collection_time: std::time::Instant::now(), } } @@ -63,15 +63,21 @@ impl CpuMetricsCollector { let now_instant = std::time::Instant::now(); // Skip first collection - if self.last_usage_ns == -1.0 { - debug!("First CPU collection, skipping rate computation"); - self.last_usage_ns = current_usage_ns; + let Some(last_usage_ns) = self.last_usage_ns else { + debug!("First CPU collection, skipping interval"); + self.last_usage_ns = Some(current_usage_ns); self.last_collection_time = now_instant; return; - } + }; - let delta_ns = current_usage_ns - self.last_usage_ns; - self.last_usage_ns = current_usage_ns; + if current_usage_ns < last_usage_ns { + debug!("Current CPU usage is less than last usage, skipping interval"); + self.last_usage_ns = Some(current_usage_ns); + self.last_collection_time = now_instant; + return; + } + let delta_ns = (current_usage_ns - last_usage_ns) as f64; + self.last_usage_ns = Some(current_usage_ns); let elapsed_secs = now_instant .duration_since(self.last_collection_time) .as_secs_f64(); diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index cb9f9cf9..f07e43fa 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -42,7 +42,7 @@ fn build_cpu_stats(cgroup_stats: &CgroupStats) -> Option { let (limit_nc, defaulted) = compute_cpu_limit_nc(cgroup_stats); Some(CpuStats { - total: total as f64, + total: total, limit: Some(limit_nc), defaulted_limit: defaulted, }) From 5f1605384b281aba699c7df9c1c5191c7efaeff4 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 11 Mar 2026 12:47:36 -0400 Subject: [PATCH 31/35] Change collection interval to 1 for precision and remove unneeded logs --- crates/datadog-metrics-collector/src/cpu.rs | 2 -- crates/datadog-metrics-collector/src/linux.rs | 9 --------- crates/datadog-serverless-compat/src/main.rs | 2 +- 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 43a5f2cf..9a347040 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -58,7 +58,6 @@ impl CpuMetricsCollector { pub fn collect_and_submit(&mut self) { if let Some(cpu_stats) = self.reader.read() { // Submit metrics - debug!("Collected CPU stats!"); let current_usage_ns = cpu_stats.total; let now_instant = std::time::Instant::now(); @@ -118,7 +117,6 @@ impl CpuMetricsCollector { error!("Failed to insert CPU limit metric: {}", e); } } - debug!("Submitting CPU metrics!"); } else { debug!("Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics"); } diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index f07e43fa..13dff714 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -82,9 +82,6 @@ fn read_cgroup_stats() -> CgroupStats { } }) }); - if scheduler_quota.is_none() { - debug!("Could not read scheduler quota from {CGROUP_CPU_QUOTA_PATH}"); - } CgroupStats { total, @@ -133,8 +130,6 @@ fn read_cpu_count_from_file(path: &str) -> Result { cpu_count += 1; } } - - debug!("Total CPU count: {cpu_count}"); Ok(cpu_count) } @@ -144,10 +139,6 @@ fn compute_cpu_limit_nc(cgroup_stats: &CgroupStats) -> (f64, bool) { Some(limit) => (limit, false), None => { let host_cpu_count = num_cpus::get() as f64; - debug!( - "No CPU limit found, defaulting to host CPU count: {} CPUs", - host_cpu_count - ); (host_cpu_count * 1000000000.0, true) // Convert to nanocores } } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 3f8b047b..dd59752f 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -42,7 +42,7 @@ use dogstatsd::{ use dogstatsd::metric::{SortedTags, EMPTY_TAGS}; use tokio_util::sync::CancellationToken; -const CPU_METRICS_COLLECTION_INTERVAL_SECS: u64 = 3; +const CPU_METRICS_COLLECTION_INTERVAL_SECS: u64 = 1; const DOGSTATSD_FLUSH_INTERVAL: u64 = 10; const DOGSTATSD_TIMEOUT_DURATION: Duration = Duration::from_secs(5); const DEFAULT_DOGSTATSD_PORT: u16 = 8125; From 60cdecf682ea06d1c45b3226f1eaedfd6bb39ad1 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 11 Mar 2026 13:14:17 -0400 Subject: [PATCH 32/35] Add comment to clarify shared aggregator between dogstatsd and cpu collector --- crates/datadog-serverless-compat/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index dd59752f..3afe3eab 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -182,6 +182,9 @@ pub async fn main() { let needs_aggregator = dd_use_dogstatsd || dd_enhanced_metrics; + // The aggregator is shared between dogstatsd and enhanced metrics. + // It is started independently so that either can be enabled without the other. + // Only dogstatsd needs the dogstatsd listener let (metrics_flusher, aggregator_handle) = if needs_aggregator { debug!("Creating metrics flusher and aggregator"); From f867f6f1ee8ca942f21c840934a11839614d0603 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 11 Mar 2026 15:09:50 -0400 Subject: [PATCH 33/35] Move tag building logic from datadog-serverless-compat to datadog-metrics-collector --- Cargo.lock | 2 +- crates/datadog-metrics-collector/Cargo.toml | 1 + crates/datadog-metrics-collector/src/cpu.rs | 41 +++++++++++++++++++ crates/datadog-serverless-compat/Cargo.toml | 1 - crates/datadog-serverless-compat/src/main.rs | 42 +------------------- 5 files changed, 44 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 52ce5a6a..99d61bc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -395,6 +395,7 @@ name = "datadog-metrics-collector" version = "0.1.0" dependencies = [ "dogstatsd", + "libdd-common", "num_cpus", "tracing", ] @@ -421,7 +422,6 @@ dependencies = [ "datadog-metrics-collector", "datadog-trace-agent", "dogstatsd", - "libdd-common", "libdd-trace-utils", "reqwest", "tokio", diff --git a/crates/datadog-metrics-collector/Cargo.toml b/crates/datadog-metrics-collector/Cargo.toml index 957949e6..98ee2a48 100644 --- a/crates/datadog-metrics-collector/Cargo.toml +++ b/crates/datadog-metrics-collector/Cargo.toml @@ -9,6 +9,7 @@ description = "Collector to read, compute, and submit enhanced metrics in Server dogstatsd = { path = "../dogstatsd", default-features = true } num_cpus = "1.16" tracing = { version = "0.1", default-features = false } +libdd-common = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95", default-features = false } [features] windows-enhanced-metrics = [] diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 9a347040..2dffb97e 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -10,6 +10,8 @@ use dogstatsd::aggregator::AggregatorHandle; use dogstatsd::metric::{Metric, MetricValue, SortedTags}; +use libdd_common::azure_app_services; +use std::env; use tracing::{debug, error}; const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.usage"; @@ -122,3 +124,42 @@ impl CpuMetricsCollector { } } } + +pub fn build_cpu_metrics_tags() -> Option { + let mut tag_parts = Vec::new(); + // Azure tags from ddcommon + if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { + let aas_tags = [ + ("resource_id", aas_metadata.get_resource_id()), + ("resource_group", aas_metadata.get_resource_group()), + ("subscription_id", aas_metadata.get_subscription_id()), + ("name", aas_metadata.get_site_name()), + ]; + for (name, value) in aas_tags { + if value != "unknown" { + tag_parts.push(format!("{}:{}", name, value)); + } + } + } + + // Tags from env vars (not in ddcommon) - origin tag is added by DogStatsD + for (tag_name, env_var) in [ + ("region", "REGION_NAME"), + ("plan_tier", "WEBSITE_SKU"), + ("service", "DD_SERVICE"), + ("env", "DD_ENV"), + ("version", "DD_VERSION"), + ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"), + ] { + if let Ok(val) = env::var(env_var) { + if !val.is_empty() { + tag_parts.push(format!("{}:{}", tag_name, val)); + } + } + } + + if tag_parts.is_empty() { + return None; + } + SortedTags::parse(&tag_parts.join(",")).ok() +} diff --git a/crates/datadog-serverless-compat/Cargo.toml b/crates/datadog-serverless-compat/Cargo.toml index 172be4c7..6fb19a4d 100644 --- a/crates/datadog-serverless-compat/Cargo.toml +++ b/crates/datadog-serverless-compat/Cargo.toml @@ -14,7 +14,6 @@ windows-enhanced-metrics = ["datadog-metrics-collector/windows-enhanced-metrics" datadog-trace-agent = { path = "../datadog-trace-agent" } datadog-metrics-collector = { path = "../datadog-metrics-collector" } libdd-trace-utils = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95" } -libdd-common = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95", default-features = false } datadog-fips = { path = "../datadog-fips", default-features = false } dogstatsd = { path = "../dogstatsd", default-features = true } reqwest = { version = "0.12.4", default-features = false } diff --git a/crates/datadog-serverless-compat/src/main.rs b/crates/datadog-serverless-compat/src/main.rs index 3afe3eab..f4c3009c 100644 --- a/crates/datadog-serverless-compat/src/main.rs +++ b/crates/datadog-serverless-compat/src/main.rs @@ -25,7 +25,6 @@ use datadog_trace_agent::{ use datadog_metrics_collector::cpu::CpuMetricsCollector; -use libdd_common::azure_app_services; use libdd_trace_utils::{config_utils::read_cloud_env, trace_utils::EnvironmentType}; use datadog_fips::reqwest_adapter::create_reqwest_client_builder; @@ -217,7 +216,7 @@ pub async fn main() { let mut cpu_collector = if dd_enhanced_metrics && metrics_flusher.is_some() { aggregator_handle.as_ref().map(|handle| { - let tags = build_cpu_metrics_tags(); + let tags = datadog_metrics_collector::cpu::build_cpu_metrics_tags(); CpuMetricsCollector::new(handle.clone(), tags) }) } else { @@ -364,42 +363,3 @@ fn build_metrics_client( } Ok(builder.build()?) } - -fn build_cpu_metrics_tags() -> Option { - let mut tag_parts = Vec::new(); - // Azure tags from ddcommon - if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { - let aas_tags = [ - ("resource_id", aas_metadata.get_resource_id()), - ("resource_group", aas_metadata.get_resource_group()), - ("subscription_id", aas_metadata.get_subscription_id()), - ("name", aas_metadata.get_site_name()), - ]; - for (name, value) in aas_tags { - if value != "unknown" { - tag_parts.push(format!("{}:{}", name, value)); - } - } - } - - // Tags from env vars (not in ddcommon) - origin tag is added by DogStatsD - for (tag_name, env_var) in [ - ("region", "REGION_NAME"), - ("plan_tier", "WEBSITE_SKU"), - ("service", "DD_SERVICE"), - ("env", "DD_ENV"), - ("version", "DD_VERSION"), - ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"), - ] { - if let Ok(val) = env::var(env_var) { - if !val.is_empty() { - tag_parts.push(format!("{}:{}", tag_name, val)); - } - } - } - - if tag_parts.is_empty() { - return None; - } - SortedTags::parse(&tag_parts.join(",")).ok() -} From 2ad5f248d4bd78a57d43c8f631fc3b32329ddd11 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 11 Mar 2026 15:11:06 -0400 Subject: [PATCH 34/35] Remove unused dependencies from datadog-trace-agent --- Cargo.lock | 2 -- crates/datadog-trace-agent/Cargo.toml | 2 -- 2 files changed, 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 99d61bc7..9571b01c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -439,7 +439,6 @@ dependencies = [ "async-trait", "bytes", "datadog-fips", - "dogstatsd", "duplicate", "http-body-util", "hyper", @@ -449,7 +448,6 @@ dependencies = [ "libdd-trace-obfuscation", "libdd-trace-protobuf", "libdd-trace-utils", - "num_cpus", "reqwest", "rmp-serde", "serde", diff --git a/crates/datadog-trace-agent/Cargo.toml b/crates/datadog-trace-agent/Cargo.toml index a7e14c5a..aec60c93 100644 --- a/crates/datadog-trace-agent/Cargo.toml +++ b/crates/datadog-trace-agent/Cargo.toml @@ -33,8 +33,6 @@ libdd-trace-obfuscation = { git = "https://github.com/DataDog/libdatadog", rev = datadog-fips = { path = "../datadog-fips" } reqwest = { version = "0.12.23", features = ["json", "http2"], default-features = false } bytes = "1.10.1" -dogstatsd = { path = "../dogstatsd", default-features = true } -num_cpus = "1.16" [dev-dependencies] rmp-serde = "1.1.1" From b4a762468ef729df9863e2a27ed8f784d1605814 Mon Sep 17 00:00:00 2001 From: Kathie Huang Date: Wed, 11 Mar 2026 15:19:38 -0400 Subject: [PATCH 35/35] Formatting --- crates/datadog-metrics-collector/src/cpu.rs | 4 +++- crates/datadog-metrics-collector/src/linux.rs | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/datadog-metrics-collector/src/cpu.rs b/crates/datadog-metrics-collector/src/cpu.rs index 2dffb97e..7e6522bb 100644 --- a/crates/datadog-metrics-collector/src/cpu.rs +++ b/crates/datadog-metrics-collector/src/cpu.rs @@ -120,7 +120,9 @@ impl CpuMetricsCollector { } } } else { - debug!("Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics"); + debug!( + "Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics" + ); } } } diff --git a/crates/datadog-metrics-collector/src/linux.rs b/crates/datadog-metrics-collector/src/linux.rs index 13dff714..d8f7b59d 100644 --- a/crates/datadog-metrics-collector/src/linux.rs +++ b/crates/datadog-metrics-collector/src/linux.rs @@ -171,7 +171,10 @@ fn compute_cgroup_cpu_limit_nc(cgroup_stats: &CgroupStats) -> Option { } Some(current_limit_nc) if quota_limit_nc < current_limit_nc => { limit_nc = Some(quota_limit_nc); - debug!("CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {} nanocores", quota_limit_nc); + debug!( + "CPU limit from cfs quota is less than current limit, setting CPU limit from cfs quota: {} nanocores", + quota_limit_nc + ); } _ => { debug!("Keeping cpuset limit: {:?} nanocores", limit_nc);