From 592cc77c74ca8f1c5af9c37f03d65b415be48da1 Mon Sep 17 00:00:00 2001 From: Rudra Pratap SIngh Date: Mon, 23 Mar 2026 21:02:25 +0530 Subject: [PATCH] Fix WindowsPerformanceCounter OOM on ARM64 >64 LP systems Add circuit breaker to WindowsPerformanceCounter that disables after 5 consecutive failures, preventing the error flood (7M-32M events) that caused OOM on Cobalt 200 (132 LP) machines. Add WmiPerformanceCounterProvider as fallback using wmic.exe subprocess when legacy PerformanceCounter API fails. Supports all Processor/ Processor Information counters with bidirectional name mapping. WindowsPerformanceCounterMonitor now test-reads counters at startup and switches to WMI fallback if legacy API is broken for a category. Tested on ARM64 132-LP machine: 0 errors, 76K+ metrics, exit code 0. Changes: - WindowsPerformanceCounter: IsDisabled, ConsecutiveFailures, LastError, ResetDisabledState(), MaxConsecutiveFailures=5 - WmiPerformanceCounterProvider (new): wmic subprocess CSV parser with forward/reverse counter name mapping - WindowsPerformanceCounterMonitor: WmiCounters dict, TryActivateWmiFallback, test-read validation in LoadCounters, WMI capture/snapshot in loops - Unit tests: circuit breaker, snapshot strategies, WMI mappings, disabled-counter skip in capture loop --- .../WindowsPerformanceCounterTests.cs | 153 ++++++- .../WmiPerformanceCounterProviderTests.cs | 101 ++++ .../WindowsPerformanceCounter.cs | 51 +++ .../WmiPerformanceCounterProvider.cs | 430 ++++++++++++++++++ .../WindowsPerformanceCounterMonitorTests.cs | 48 ++ .../WindowsPerformanceCounterMonitor.cs | 175 ++++++- 6 files changed, 954 insertions(+), 4 deletions(-) create mode 100644 src/VirtualClient/VirtualClient.Core.UnitTests/WmiPerformanceCounterProviderTests.cs create mode 100644 src/VirtualClient/VirtualClient.Core/WmiPerformanceCounterProvider.cs diff --git a/src/VirtualClient/VirtualClient.Core.UnitTests/WindowsPerformanceCounterTests.cs 
b/src/VirtualClient/VirtualClient.Core.UnitTests/WindowsPerformanceCounterTests.cs index 224b799df7..2b269e8dc8 100644 --- a/src/VirtualClient/VirtualClient.Core.UnitTests/WindowsPerformanceCounterTests.cs +++ b/src/VirtualClient/VirtualClient.Core.UnitTests/WindowsPerformanceCounterTests.cs @@ -124,6 +124,157 @@ public void WindowsPerformanceCounterReturnsTheExpectedSnapshotWhenARawStrategyI Assert.AreEqual((double)captures.Last(), snapshot.Value); } + [Test] + public void WindowsPerformanceCounterDisablesAfterMaxConsecutiveFailures() + { + this.performanceCounter.OnGetCounterValue = () => throw new InvalidOperationException("Instance '7' does not exist in the specified Category."); + + for (int i = 0; i < WindowsPerformanceCounter.MaxConsecutiveFailures; i++) + { + Assert.Throws(() => this.performanceCounter.Capture()); + } + + Assert.IsTrue(this.performanceCounter.IsDisabled); + Assert.AreEqual(WindowsPerformanceCounter.MaxConsecutiveFailures, this.performanceCounter.ConsecutiveFailures); + } + + [Test] + public void WindowsPerformanceCounterSkipsCaptureWhenDisabled() + { + this.performanceCounter.OnGetCounterValue = () => throw new InvalidOperationException("test"); + + // Disable via consecutive failures + for (int i = 0; i < WindowsPerformanceCounter.MaxConsecutiveFailures; i++) + { + Assert.Throws(() => this.performanceCounter.Capture()); + } + + Assert.IsTrue(this.performanceCounter.IsDisabled); + + // Subsequent calls should return immediately without throwing + Assert.DoesNotThrow(() => this.performanceCounter.Capture()); + } + + [Test] + public void WindowsPerformanceCounterResetsFailureCountOnSuccess() + { + int callCount = 0; + this.performanceCounter.OnGetCounterValue = () => + { + callCount++; + if (callCount <= 3) + { + throw new InvalidOperationException("transient"); + } + + return 42.0f; + }; + + // 3 failures — not yet disabled + for (int i = 0; i < 3; i++) + { + Assert.Throws(() => this.performanceCounter.Capture()); + } + + 
Assert.IsFalse(this.performanceCounter.IsDisabled); + Assert.AreEqual(3, this.performanceCounter.ConsecutiveFailures); + + // 1 success — resets counter + this.performanceCounter.Capture(); + Assert.IsFalse(this.performanceCounter.IsDisabled); + Assert.AreEqual(0, this.performanceCounter.ConsecutiveFailures); + Assert.IsNull(this.performanceCounter.LastError); + } + + [Test] + public void WindowsPerformanceCounterResetDisabledStateAllowsRetry() + { + this.performanceCounter.OnGetCounterValue = () => throw new InvalidOperationException("broken"); + + // Disable the counter + for (int i = 0; i < WindowsPerformanceCounter.MaxConsecutiveFailures; i++) + { + Assert.Throws(() => this.performanceCounter.Capture()); + } + + Assert.IsTrue(this.performanceCounter.IsDisabled); + + // Reset — should allow retrying + this.performanceCounter.ResetDisabledState(); + Assert.IsFalse(this.performanceCounter.IsDisabled); + Assert.AreEqual(0, this.performanceCounter.ConsecutiveFailures); + + // Will throw again since the underlying issue persists + Assert.Throws(() => this.performanceCounter.Capture()); + } + + [Test] + public void WindowsPerformanceCounterStoresLastError() + { + var expectedException = new InvalidOperationException("Instance '42' does not exist"); + this.performanceCounter.OnGetCounterValue = () => throw expectedException; + + Assert.Throws(() => this.performanceCounter.Capture()); + Assert.AreEqual(expectedException, this.performanceCounter.LastError); + } + + [Test] + [TestCase(CaptureStrategy.Max, 500)] + [TestCase(CaptureStrategy.Min, 100)] + public void WindowsPerformanceCounterSnapshotReturnsExpectedValueForMaxAndMinStrategies(CaptureStrategy strategy, float expected) + { + float[] values = new float[] { 100, 500, 300 }; + int captureIndex = 0; + var counter = new TestWindowsPerformanceCounter("Cat", "Cnt", "Inst", strategy); + counter.OnGetCounterValue = () => + { + float val = values[captureIndex]; + captureIndex++; + return val; + }; + + foreach (var _ in 
values) + { + counter.Capture(); + } + + Metric snapshot = counter.Snapshot(); + Assert.AreEqual(expected, snapshot.Value); + Assert.AreEqual(@"\Cat(Inst)\Cnt", snapshot.Name); + } + + [Test] + public void WindowsPerformanceCounterSnapshotReturnsNoneWhenEmptyAndResetClearsState() + { + // Snapshot with no captures returns Metric.None + Metric noData = this.performanceCounter.Snapshot(); + Assert.AreEqual(Metric.None, noData); + + // Capture values, verify they exist + this.performanceCounter.OnGetCounterValue = () => 42.0f; + this.performanceCounter.Capture(); + this.performanceCounter.Capture(); + Assert.AreEqual(2, this.performanceCounter.CounterValues.Count()); + + // Reset clears all values + this.performanceCounter.Reset(); + Assert.IsEmpty(this.performanceCounter.CounterValues); + + // After reset, snapshot returns None again + Assert.AreEqual(Metric.None, this.performanceCounter.Snapshot()); + + // GetCounterName formats correctly with and without instance + Assert.AreEqual(@"\Cat\Cnt", WindowsPerformanceCounter.GetCounterName("Cat", "Cnt")); + Assert.AreEqual(@"\Cat(Inst)\Cnt", WindowsPerformanceCounter.GetCounterName("Cat", "Cnt", "Inst")); + + // ToString returns MetricName + Assert.AreEqual(this.performanceCounter.MetricName, this.performanceCounter.ToString()); + + // MetricRelativity constructor + var relCounter = new WindowsPerformanceCounter("A", "B", CaptureStrategy.Average, MetricRelativity.HigherIsBetter); + Assert.AreEqual(MetricRelativity.HigherIsBetter, relCounter.MetricRelativity); + } + private class TestWindowsPerformanceCounter : WindowsPerformanceCounter { public TestWindowsPerformanceCounter(string category, string name, string instance, CaptureStrategy captureStrategy) @@ -147,7 +298,7 @@ protected override bool TryGetCounterValue(out float? 
value) if (this.OnGetCounterValue != null) { value = this.OnGetCounterValue.Invoke(); - return true; + return value != null; } else { diff --git a/src/VirtualClient/VirtualClient.Core.UnitTests/WmiPerformanceCounterProviderTests.cs b/src/VirtualClient/VirtualClient.Core.UnitTests/WmiPerformanceCounterProviderTests.cs new file mode 100644 index 0000000000..9d30503cd4 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Core.UnitTests/WmiPerformanceCounterProviderTests.cs @@ -0,0 +1,101 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient +{ + using System.Collections.Generic; + using NUnit.Framework; + using VirtualClient.Contracts; + + [TestFixture] + [Category("Unit")] + public class WmiPerformanceCounterProviderTests + { + [Test] + public void WmiPerformanceCounterProviderStaticMappingMethodsMapCorrectly() + { + // Forward mapping — known counters + Assert.AreEqual("PercentProcessorTime", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% Processor Time")); + Assert.AreEqual("PercentUserTime", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% User Time")); + Assert.AreEqual("PercentPrivilegedTime", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% Privileged Time")); + Assert.AreEqual("PercentIdleTime", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% Idle Time")); + Assert.AreEqual("PercentInterruptTime", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% Interrupt Time")); + Assert.AreEqual("PercentDPCTime", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% DPC Time")); + Assert.AreEqual("PercentC1Time", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% C1 Time")); + Assert.AreEqual("PercentC2Time", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% C2 Time")); + Assert.AreEqual("PercentC3Time", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% C3 Time")); + Assert.AreEqual("InterruptsPersec", 
WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("Interrupts/sec")); + Assert.AreEqual("DPCsQueuedPersec", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("DPCs Queued/sec")); + Assert.AreEqual("DPCRate", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("DPC Rate")); + Assert.AreEqual("C1TransitionsPersec", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("C1 Transitions/sec")); + Assert.AreEqual("C2TransitionsPersec", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("C2 Transitions/sec")); + Assert.AreEqual("C3TransitionsPersec", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("C3 Transitions/sec")); + Assert.AreEqual("PercentProcessorPerformance", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% Processor Performance")); + Assert.AreEqual("PercentProcessorUtility", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% Processor Utility")); + Assert.AreEqual("ProcessorFrequency", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("Processor Frequency")); + Assert.AreEqual("PercentofMaximumFrequency", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("% of Maximum Frequency")); + + // Forward mapping — default fallback (spaces/special chars replaced) + Assert.AreEqual("SomeCustomCounter", WmiPerformanceCounterProvider.MapCounterNameToWmiProperty("Some Custom Counter")); + + // Reverse mapping — known properties + Assert.AreEqual("% Processor Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentProcessorTime")); + Assert.AreEqual("% User Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentUserTime")); + Assert.AreEqual("% Privileged Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentPrivilegedTime")); + Assert.AreEqual("% Idle Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentIdleTime")); + Assert.AreEqual("% Interrupt Time", 
WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentInterruptTime")); + Assert.AreEqual("% DPC Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentDPCTime")); + Assert.AreEqual("% C1 Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentC1Time")); + Assert.AreEqual("% C2 Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentC2Time")); + Assert.AreEqual("% C3 Time", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentC3Time")); + Assert.AreEqual("Interrupts/sec", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("InterruptsPersec")); + Assert.AreEqual("DPCs Queued/sec", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("DPCsQueuedPersec")); + Assert.AreEqual("DPC Rate", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("DPCRate")); + Assert.AreEqual("C1 Transitions/sec", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("C1TransitionsPersec")); + Assert.AreEqual("C2 Transitions/sec", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("C2TransitionsPersec")); + Assert.AreEqual("C3 Transitions/sec", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("C3TransitionsPersec")); + Assert.AreEqual("% Processor Performance", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentProcessorPerformance")); + Assert.AreEqual("% Processor Utility", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentProcessorUtility")); + Assert.AreEqual("Processor Frequency", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("ProcessorFrequency")); + Assert.AreEqual("% of Maximum Frequency", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("PercentofMaximumFrequency")); + + // Reverse mapping — default passthrough for unknown properties + Assert.AreEqual("UnknownProp", WmiPerformanceCounterProvider.MapWmiPropertyToCounterName("UnknownProp")); + + // GetWmiClassName — supported categories + 
Assert.AreEqual("Win32_PerfFormattedData_Counters_ProcessorInformation", WmiPerformanceCounterProvider.GetWmiClassName("Processor")); + Assert.AreEqual("Win32_PerfFormattedData_Counters_ProcessorInformation", WmiPerformanceCounterProvider.GetWmiClassName("Processor Information")); + + // GetWmiClassName — unsupported categories return null + Assert.IsNull(WmiPerformanceCounterProvider.GetWmiClassName("Memory")); + Assert.IsNull(WmiPerformanceCounterProvider.GetWmiClassName("PhysicalDisk")); + } + + [Test] + public void WmiPerformanceCounterProviderConstructorAndSnapshotWorkCorrectly() + { + using (var provider = new WmiPerformanceCounterProvider("Processor", "% Processor Time", "_Total", CaptureStrategy.Average)) + { + // Constructor sets all properties + Assert.AreEqual("Processor", provider.Category); + Assert.AreEqual("% Processor Time", provider.Name); + Assert.AreEqual("_Total", provider.InstanceName); + Assert.AreEqual(CaptureStrategy.Average, provider.Strategy); + Assert.AreEqual(@"\Processor(_Total)\% Processor Time", provider.MetricName); + Assert.AreEqual(MetricRelativity.Undefined, provider.MetricRelativity); + Assert.IsFalse(provider.IsDisabled); + + // Snapshot with no captures returns Metric.None + Metric noData = provider.Snapshot(); + Assert.AreEqual(Metric.None, noData); + + // Reset does not throw + Assert.DoesNotThrow(() => provider.Reset()); + + // QueryAllInstances with unsupported category returns empty dictionary + Dictionary> empty = WmiPerformanceCounterProvider.QueryAllInstances("Memory"); + Assert.IsEmpty(empty); + } + } + } +} diff --git a/src/VirtualClient/VirtualClient.Core/WindowsPerformanceCounter.cs b/src/VirtualClient/VirtualClient.Core/WindowsPerformanceCounter.cs index 6fd89cebce..2721cc21e8 100644 --- a/src/VirtualClient/VirtualClient.Core/WindowsPerformanceCounter.cs +++ b/src/VirtualClient/VirtualClient.Core/WindowsPerformanceCounter.cs @@ -18,12 +18,19 @@ namespace VirtualClient /// public class WindowsPerformanceCounter : 
IPerformanceMetric, IDisposable { + /// + /// The maximum number of consecutive capture failures before the counter is disabled. + /// + public const int MaxConsecutiveFailures = 5; + private PerformanceCounter counter; private ConcurrentBag counterValues; private SemaphoreSlim semaphore; private DateTime? captureStartTime; private DateTime nextCounterVerificationTime; private bool disposed; + private int consecutiveFailures; + private Exception lastError; /// /// Intialize a new instance of the class. @@ -156,6 +163,22 @@ public WindowsPerformanceCounter(string counterCategory, string counterName, str /// public CaptureStrategy Strategy { get; } + /// + /// Gets a value indicating whether this counter has been disabled due to + /// repeated consecutive capture failures. + /// + public bool IsDisabled { get; private set; } + + /// + /// Gets the number of consecutive capture failures. + /// + public int ConsecutiveFailures => this.consecutiveFailures; + + /// + /// Gets the last error that occurred during capture. + /// + public Exception LastError => this.lastError; + /// /// The set of counter values that have been captured during the current /// interval. @@ -191,6 +214,11 @@ public static string GetCounterName(string category, string counterName, string /// public void Capture() { + if (this.IsDisabled) + { + return; + } + try { this.semaphore.Wait(CancellationToken.None); @@ -198,6 +226,8 @@ public void Capture() if (this.TryGetCounterValue(out float? 
counterValue)) { this.counterValues.Add(counterValue.Value); + Interlocked.Exchange(ref this.consecutiveFailures, 0); + this.lastError = null; if (this.captureStartTime == null) { @@ -205,12 +235,33 @@ public void Capture() } } } + catch (Exception exc) + { + this.lastError = exc; + int failures = Interlocked.Increment(ref this.consecutiveFailures); + if (failures >= WindowsPerformanceCounter.MaxConsecutiveFailures) + { + this.IsDisabled = true; + } + + throw; + } finally { this.semaphore.Release(); } } + /// + /// Resets the disabled state so the counter can be retried. + /// + public void ResetDisabledState() + { + this.IsDisabled = false; + Interlocked.Exchange(ref this.consecutiveFailures, 0); + this.lastError = null; + } + /// /// Disposes of resources used by the instance. /// diff --git a/src/VirtualClient/VirtualClient.Core/WmiPerformanceCounterProvider.cs b/src/VirtualClient/VirtualClient.Core/WmiPerformanceCounterProvider.cs new file mode 100644 index 0000000000..b8692a6f91 --- /dev/null +++ b/src/VirtualClient/VirtualClient.Core/WmiPerformanceCounterProvider.cs @@ -0,0 +1,430 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient +{ + using System; + using System.Collections.Concurrent; + using System.Collections.Generic; + using System.Diagnostics; + using System.Globalization; + using System.Linq; + using System.Threading; + using VirtualClient.Contracts; + + /// + /// Provides CPU performance counter data via WMIC subprocess as a fallback when the + /// legacy API fails on + /// systems with more than 64 logical processors. + /// + /// + /// Uses Win32_PerfFormattedData_Counters_ProcessorInformation which supports + /// multi-processor groups and returns all CPU cores with "Group,Core" instance naming. + /// Data is collected by invoking wmic.exe and parsing CSV output. 
+ /// + public class WmiPerformanceCounterProvider : IPerformanceMetric, IDisposable + { + private readonly string wmiPropertyName; + private ConcurrentBag counterValues; + private SemaphoreSlim semaphore; + private DateTime? captureStartTime; + private bool disposed; + + /// + /// Initializes a new instance of the class. + /// + /// The original performance counter category name (e.g. "Processor"). + /// The original performance counter name (e.g. "% Processor Time"). + /// The instance name (e.g. "_Total", "0,5"). + /// The capture strategy. + public WmiPerformanceCounterProvider(string counterCategory, string counterName, string instanceName, CaptureStrategy captureStrategy) + { + this.Category = counterCategory; + this.Name = counterName; + this.InstanceName = instanceName; + this.Strategy = captureStrategy; + this.wmiPropertyName = WmiPerformanceCounterProvider.MapCounterNameToWmiProperty(counterName); + this.MetricName = WindowsPerformanceCounter.GetCounterName(counterCategory, counterName, instanceName); + this.MetricRelativity = MetricRelativity.Undefined; + this.counterValues = new ConcurrentBag(); + this.semaphore = new SemaphoreSlim(1); + } + + /// + /// The counter category name. + /// + public string Category { get; } + + /// + /// The counter name. + /// + public string Name { get; } + + /// + /// The instance name. + /// + public string InstanceName { get; } + + /// + /// The capture strategy. + /// + public CaptureStrategy Strategy { get; } + + /// + public string MetricName { get; set; } + + /// + public string MetricUnit { get; set; } + + /// + /// Gets or sets the metric relativity. + /// + public MetricRelativity MetricRelativity { get; set; } + + /// + /// Gets a value indicating this provider is never disabled (WMI path is reliable). + /// + public bool IsDisabled => false; + + /// + /// Returns the WMI class name that corresponds to a performance counter category. 
+ /// + public static string GetWmiClassName(string counterCategory) + { + if (string.Equals(counterCategory, "Processor", StringComparison.OrdinalIgnoreCase) + || string.Equals(counterCategory, "Processor Information", StringComparison.OrdinalIgnoreCase)) + { + return "Win32_PerfFormattedData_Counters_ProcessorInformation"; + } + + return null; + } + + /// + /// Maps a performance counter name to a WMI property name. + /// + public static string MapCounterNameToWmiProperty(string counterName) + { + return counterName?.Trim() switch + { + "% Processor Time" => "PercentProcessorTime", + "% User Time" => "PercentUserTime", + "% Privileged Time" => "PercentPrivilegedTime", + "% Idle Time" => "PercentIdleTime", + "% Interrupt Time" => "PercentInterruptTime", + "% DPC Time" => "PercentDPCTime", + "% C1 Time" => "PercentC1Time", + "% C2 Time" => "PercentC2Time", + "% C3 Time" => "PercentC3Time", + "Interrupts/sec" => "InterruptsPersec", + "DPCs Queued/sec" => "DPCsQueuedPersec", + "DPC Rate" => "DPCRate", + "C1 Transitions/sec" => "C1TransitionsPersec", + "C2 Transitions/sec" => "C2TransitionsPersec", + "C3 Transitions/sec" => "C3TransitionsPersec", + "% Processor Performance" => "PercentProcessorPerformance", + "% Processor Utility" => "PercentProcessorUtility", + "Processor Frequency" => "ProcessorFrequency", + "% of Maximum Frequency" => "PercentofMaximumFrequency", + _ => counterName?.Replace(" ", string.Empty).Replace("%", "Percent").Replace("/", "Per") + }; + } + + /// + /// Maps a WMI property name back to the original performance counter name format. 
+ /// + public static string MapWmiPropertyToCounterName(string wmiProperty) + { + return wmiProperty?.Trim() switch + { + "PercentProcessorTime" => "% Processor Time", + "PercentUserTime" => "% User Time", + "PercentPrivilegedTime" => "% Privileged Time", + "PercentIdleTime" => "% Idle Time", + "PercentInterruptTime" => "% Interrupt Time", + "PercentDPCTime" => "% DPC Time", + "PercentC1Time" => "% C1 Time", + "PercentC2Time" => "% C2 Time", + "PercentC3Time" => "% C3 Time", + "InterruptsPersec" => "Interrupts/sec", + "DPCsQueuedPersec" => "DPCs Queued/sec", + "DPCRate" => "DPC Rate", + "C1TransitionsPersec" => "C1 Transitions/sec", + "C2TransitionsPersec" => "C2 Transitions/sec", + "C3TransitionsPersec" => "C3 Transitions/sec", + "PercentProcessorPerformance" => "% Processor Performance", + "PercentProcessorUtility" => "% Processor Utility", + "ProcessorFrequency" => "Processor Frequency", + "PercentofMaximumFrequency" => "% of Maximum Frequency", + _ => wmiProperty + }; + } + + /// + /// Queries WMIC for all instances of the Processor Information counter set. + /// Returns a dictionary mapping instance names (e.g. "0,5", "_Total") to their metric values. 
+ /// + public static Dictionary> QueryAllInstances(string counterCategory) + { + var results = new Dictionary>(StringComparer.OrdinalIgnoreCase); + string wmiClass = WmiPerformanceCounterProvider.GetWmiClassName(counterCategory); + if (wmiClass == null) + { + return results; + } + + string output = WmiPerformanceCounterProvider.RunWmic( + $"path {wmiClass} get Name,PercentProcessorTime,PercentUserTime,PercentPrivilegedTime,PercentIdleTime,PercentDPCTime,PercentInterruptTime,PercentC1Time,PercentC2Time,PercentC3Time,InterruptsPersec,DPCsQueuedPersec,DPCRate,C1TransitionsPersec,C2TransitionsPersec,C3TransitionsPersec /format:csv"); + + if (string.IsNullOrWhiteSpace(output)) + { + return results; + } + + string[] lines = output.Split('\n', StringSplitOptions.RemoveEmptyEntries); + if (lines.Length < 2) + { + return results; + } + + // Find header line (contains "Node," and "Name,") + string[] headers = null; + int dataStart = 0; + for (int i = 0; i < lines.Length; i++) + { + string line = lines[i].Trim(); + if (line.StartsWith("Node,", StringComparison.OrdinalIgnoreCase) || line.Contains(",Name,")) + { + headers = line.Split(','); + dataStart = i + 1; + break; + } + } + + if (headers == null) + { + return results; + } + + int nameIndex = Array.FindIndex(headers, h => string.Equals(h.Trim(), "Name", StringComparison.OrdinalIgnoreCase)); + if (nameIndex < 0) + { + return results; + } + + for (int i = dataStart; i < lines.Length; i++) + { + string line = lines[i].Trim(); + if (string.IsNullOrWhiteSpace(line)) + { + continue; + } + + string[] values = line.Split(','); + if (values.Length <= nameIndex) + { + continue; + } + + string instanceName = values[nameIndex].Trim(); + if (string.IsNullOrWhiteSpace(instanceName)) + { + continue; + } + + var props = new Dictionary(StringComparer.OrdinalIgnoreCase); + for (int j = 0; j < Math.Min(headers.Length, values.Length); j++) + { + string header = headers[j].Trim(); + if (j == nameIndex || string.Equals(header, "Node", 
StringComparison.OrdinalIgnoreCase)) + { + continue; + } + + if (float.TryParse(values[j].Trim(), NumberStyles.Any, CultureInfo.InvariantCulture, out float val)) + { + props[header] = val; + } + } + + results[instanceName] = props; + } + + return results; + } + + /// + public void Capture() + { + try + { + this.semaphore.Wait(CancellationToken.None); + + float? value = this.ReadValue(); + if (value != null) + { + this.counterValues.Add(value.Value); + if (this.captureStartTime == null) + { + this.captureStartTime = DateTime.UtcNow; + } + } + } + finally + { + this.semaphore.Release(); + } + } + + /// + public void Reset() + { + try + { + this.semaphore.Wait(CancellationToken.None); + this.captureStartTime = null; + this.counterValues.Clear(); + } + finally + { + this.semaphore.Release(); + } + } + + /// + public Metric Snapshot() + { + try + { + this.semaphore.Wait(CancellationToken.None); + if (!this.counterValues.Any()) + { + return Metric.None; + } + + float value = this.Strategy switch + { + CaptureStrategy.Average => this.counterValues.Average(), + CaptureStrategy.Max => this.counterValues.Max(), + CaptureStrategy.Min => this.counterValues.Min(), + CaptureStrategy.Raw => this.counterValues.First(), + _ => throw new MonitorException( + $"Unsupported performance capture strategy '{this.Strategy}' provided.", + ErrorReason.MonitorUnexpectedAnomaly) + }; + + return new Metric(this.MetricName, value, null, this.MetricRelativity) + { + StartTime = this.captureStartTime ?? DateTime.UtcNow, + EndTime = DateTime.UtcNow + }; + } + finally + { + this.captureStartTime = null; + this.counterValues.Clear(); + this.semaphore.Release(); + } + } + + /// + public void Dispose() + { + this.Dispose(true); + GC.SuppressFinalize(this); + } + + /// + /// Disposes of resources. 
+ /// + protected virtual void Dispose(bool disposing) + { + if (disposing && !this.disposed) + { + this.semaphore?.Dispose(); + this.disposed = true; + } + } + + private static string RunWmic(string arguments) + { + try + { + using var process = new Process(); + process.StartInfo = new ProcessStartInfo + { + FileName = "wmic", + Arguments = arguments, + UseShellExecute = false, + RedirectStandardOutput = true, + RedirectStandardError = true, + CreateNoWindow = true + }; + + process.Start(); + string output = process.StandardOutput.ReadToEnd(); + process.WaitForExit(10000); + return output; + } + catch + { + return null; + } + } + + private float? ReadValue() + { + if (this.wmiPropertyName == null) + { + return null; + } + + string wmiClass = WmiPerformanceCounterProvider.GetWmiClassName(this.Category); + if (wmiClass == null) + { + return null; + } + + string whereClause = string.IsNullOrWhiteSpace(this.InstanceName) || this.InstanceName == "_Total" + ? "WHERE \"Name='_Total'\"" + : $"WHERE \"Name='{this.InstanceName}'\""; + + string output = WmiPerformanceCounterProvider.RunWmic( + $"path {wmiClass} {whereClause} get {this.wmiPropertyName} /format:csv"); + + if (string.IsNullOrWhiteSpace(output)) + { + return null; + } + + // Parse CSV output: header line then data line + // Format: "Node,PropertyName\r\nHOSTNAME,value" + string[] lines = output.Split('\n', StringSplitOptions.RemoveEmptyEntries); + for (int i = lines.Length - 1; i >= 0; i--) + { + string line = lines[i].Trim(); + if (string.IsNullOrWhiteSpace(line)) + { + continue; + } + + // Skip header lines + if (line.StartsWith("Node,", StringComparison.OrdinalIgnoreCase) + || line.Contains(this.wmiPropertyName, StringComparison.OrdinalIgnoreCase)) + { + continue; + } + + // Data line: "HOSTNAME,value" + string[] parts = line.Split(','); + if (parts.Length >= 2 + && float.TryParse(parts[parts.Length - 1].Trim(), NumberStyles.Any, CultureInfo.InvariantCulture, out float val)) + { + return val; + } + } + + 
return null; + } + } +} diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/PerformanceCounters/WindowsPerformanceCounterMonitorTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/PerformanceCounters/WindowsPerformanceCounterMonitorTests.cs index 9bd156b17c..df1dcc8dd7 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/PerformanceCounters/WindowsPerformanceCounterMonitorTests.cs +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/PerformanceCounters/WindowsPerformanceCounterMonitorTests.cs @@ -919,6 +919,54 @@ public async Task WindowsPerformanceCounterMonitorHandlesNewCountersBeingAddedMi } } + [Test] + public async Task WindowsPerformanceCounterMonitorCaptureSkipsDisabledCountersAfterCircuitBreaker() + { + this.mockFixture.Parameters["Counters1"] = "Processor=."; + + using (CancellationTokenSource cancellationSource = new CancellationTokenSource()) + { + using (var monitor = new TestWindowsPerformanceCounterMonitor(this.mockFixture)) + { + int failInvocations = 0; + var failingCounter = new TestWindowsPerformanceCounter("FailCat", "FailCounter", CaptureStrategy.Average); + failingCounter.OnTryGetCounterValue = () => + { + failInvocations++; + throw new InvalidOperationException("Instance does not exist"); + }; + + int successCount = 0; + var workingCounter = new TestWindowsPerformanceCounter("WorkCat", "WorkCounter", CaptureStrategy.Average); + workingCounter.OnTryGetCounterValue = () => + { + successCount++; + if (successCount >= 20) + { + cancellationSource.Cancel(); + } + + return true; + }; + + monitor.Counters.Add(@"\FailCat\FailCounter", failingCounter); + monitor.Counters.Add(@"\WorkCat\WorkCounter", workingCounter); + + await monitor.InitializeAsync(EventContext.None, CancellationToken.None); + Task captureTask = monitor.CaptureCountersAsync(EventContext.None, cancellationSource.Token); + + await Task.WhenAny(captureTask, Task.Delay(60000)); + + // Failing counter is disabled after MaxConsecutiveFailures and not 
called again + Assert.IsTrue(failingCounter.IsDisabled); + Assert.AreEqual(WindowsPerformanceCounter.MaxConsecutiveFailures, failInvocations); + + // Working counter continued capturing successfully + Assert.GreaterOrEqual(successCount, 20); + } + } + } + private class TestWindowsPerformanceCounterMonitor : WindowsPerformanceCounterMonitor { public TestWindowsPerformanceCounterMonitor(MockFixture fixture) diff --git a/src/VirtualClient/VirtualClient.Monitors/PerformanceCounters/WindowsPerformanceCounterMonitor.cs b/src/VirtualClient/VirtualClient.Monitors/PerformanceCounters/WindowsPerformanceCounterMonitor.cs index b90981375e..c8224a3d0e 100644 --- a/src/VirtualClient/VirtualClient.Monitors/PerformanceCounters/WindowsPerformanceCounterMonitor.cs +++ b/src/VirtualClient/VirtualClient.Monitors/PerformanceCounters/WindowsPerformanceCounterMonitor.cs @@ -30,6 +30,7 @@ public class WindowsPerformanceCounterMonitor : VirtualClientIntervalBasedMonito private ConcurrentBag categories; private SemaphoreSlim semaphore; private bool disposed; + private HashSet wmiFallbackAttempted = new HashSet(StringComparer.OrdinalIgnoreCase); /// /// Initializes a new instance of the class. @@ -87,6 +88,13 @@ protected IEnumerable Categories /// protected IDictionary Counters { get; } + /// + /// The set of WMI-based fallback counters for categories where the legacy + /// PerformanceCounter API fails (e.g. on systems with >64 logical processors). + /// + protected IDictionary WmiCounters { get; } + = new ConcurrentDictionary(); + /// /// The list of filters used to identify the performance counters to capture. 
/// @@ -137,6 +145,11 @@ protected Task CaptureCountersAsync(EventContext telemetryContext, CancellationT { foreach (WindowsPerformanceCounter counter in this.Counters.Values) { + if (counter.IsDisabled) + { + continue; + } + try { counter.Capture(); @@ -150,9 +163,6 @@ protected Task CaptureCountersAsync(EventContext telemetryContext, CancellationT } catch (TaskCanceledException) { - // This type of exception will happen if the cancellation token is cancelled - // in the Task.Delay() call above. We don't want to surface this as an exception - // because it is entirely expected. } catch (Exception exc) { @@ -161,6 +171,20 @@ protected Task CaptureCountersAsync(EventContext telemetryContext, CancellationT finally { this.semaphore.Release(); + + // Capture WMI fallback counters outside the semaphore. + foreach (WmiPerformanceCounterProvider wmiCounter in this.WmiCounters.Values) + { + try + { + wmiCounter.Capture(); + } + catch (Exception exc) + { + this.Logger.LogMessage($"{this.TypeName}.MetricSnapshotError", LogLevel.Warning, telemetryContext.Clone().AddError(exc)); + } + } + await this.WaitAsync(this.CounterCaptureInterval, cancellationToken); } } @@ -290,6 +314,8 @@ protected virtual void LoadCounters(EventContext telemetryContext, CancellationT break; } + int countersBefore = this.Counters.Count; + switch (category.CategoryType) { case PerformanceCounterCategoryType.MultiInstance: @@ -310,10 +336,66 @@ protected virtual void LoadCounters(EventContext telemetryContext, CancellationT this.AddSupportedCounters(counters); break; } + + int countersAdded = this.Counters.Count - countersBefore; + + // If counters were added, do a quick validation read to check if they actually work. + // If the first counter read fails, the legacy API is broken for this category — + // switch to WMI immediately instead of generating errors during capture. 
+ if (countersAdded > 0) + { + bool legacyWorks = false; + var testCounter = this.Counters.Values + .Where(c => string.Equals(c.Category, category.CategoryName, StringComparison.OrdinalIgnoreCase)) + .FirstOrDefault(); + + if (testCounter != null) + { + try + { + testCounter.Capture(); + legacyWorks = true; + } + catch + { + // Legacy PerformanceCounter API is broken for this category + legacyWorks = false; + } + } + + if (!legacyWorks) + { + // Remove all legacy counters for this category and switch to WMI + var keysToRemove = this.Counters + .Where(kv => string.Equals(kv.Value.Category, category.CategoryName, StringComparison.OrdinalIgnoreCase)) + .Select(kv => kv.Key) + .ToList(); + + foreach (var key in keysToRemove) + { + if (this.Counters is ConcurrentDictionary concurrentCounters) + { + concurrentCounters.TryRemove(key, out _); + } + else + { + this.Counters.Remove(key); + } + } + + this.TryActivateWmiFallback(category.CategoryName, telemetryContext); + } + } + else if (countersAdded == 0 + && this.Descriptors.Any(d => string.Equals(d.CategoryName, category.CategoryName, StringComparison.OrdinalIgnoreCase))) + { + this.TryActivateWmiFallback(category.CategoryName, telemetryContext); + } } catch (Exception exc) { this.Logger.LogMessage($"{this.TypeName}.MetricDiscoveryError", LogLevel.Warning, telemetryContext.Clone().AddError(exc)); + this.TryActivateWmiFallback(category.CategoryName, telemetryContext); } } } @@ -351,6 +433,23 @@ protected Task SnapshotCountersAsync(EventContext telemetryContext, Cancellation } } + // Include WMI fallback counter snapshots + foreach (WmiPerformanceCounterProvider wmiCounter in this.WmiCounters.Values) + { + try + { + Metric performanceSnapshot = wmiCounter.Snapshot(); + if (performanceSnapshot != null && performanceSnapshot != Metric.None) + { + metrics.Add(performanceSnapshot); + } + } + catch (Exception exc) + { + this.Logger.LogMessage($"{this.TypeName}.WmiMetricSnapshotError", LogLevel.Warning, 
telemetryContext.Clone().AddError(exc)); + } + } + if (metrics.Any()) { this.Logger.LogPerformanceCounters(".NET SDK", metrics, metrics.Min(m => m.StartTime), DateTime.UtcNow, telemetryContext); @@ -384,6 +483,95 @@ protected override void Validate() } }
+        /// <summary>
+        /// Registers WMI-backed replacements for every counter in the category that matches
+        /// a configured descriptor. Activation is attempted at most once per category.
+        /// </summary>
+        /// <param name="category">The performance counter category (e.g. Processor).</param>
+        /// <param name="telemetryContext">Context cloned onto the events logged here.</param>
+        private void TryActivateWmiFallback(string category, EventContext telemetryContext)
+        {
+            // Only attempt WMI fallback once per category.
+            if (string.IsNullOrWhiteSpace(category) || !this.wmiFallbackAttempted.Add(category))
+            {
+                return;
+            }
+
+            string wmiClass = WmiPerformanceCounterProvider.GetWmiClassName(category);
+            if (wmiClass == null)
+            {
+                return;
+            }
+
+            try
+            {
+                // Resolve the category's descriptors once, before querying WMI: the set does not
+                // change per instance, and an empty set means nothing could be registered.
+                var categoryDescriptors = this.Descriptors
+                    .Where(d => string.Equals(d.CategoryName, category, StringComparison.OrdinalIgnoreCase))
+                    .ToList();
+
+                if (!categoryDescriptors.Any())
+                {
+                    return;
+                }
+
+                var allInstances = WmiPerformanceCounterProvider.QueryAllInstances(category);
+                if (!allInstances.Any())
+                {
+                    return;
+                }
+
+                int added = 0;
+                foreach (var instance in allInstances)
+                {
+                    foreach (var prop in instance.Value)
+                    {
+                        // Map the WMI property back to the original counter name format so that
+                        // metric names match legacy naming.
+                        string originalCounterName = WmiPerformanceCounterProvider.MapWmiPropertyToCounterName(prop.Key);
+                        string counterName = WindowsPerformanceCounter.GetCounterName(
+                            category, originalCounterName, instance.Key);
+
+                        if (this.WmiCounters.ContainsKey(counterName)
+                            || !categoryDescriptors.Any(d => d.CounterExpression.IsMatch(counterName)))
+                        {
+                            continue;
+                        }
+
+                        // Set the metric name before the provider is placed in the dictionary that
+                        // the capture and snapshot loops enumerate.
+                        this.WmiCounters[counterName] = new WmiPerformanceCounterProvider(
+                            category, prop.Key, instance.Key, CaptureStrategy.Average)
+                        {
+                            MetricName = counterName
+                        };
+
+                        added++;
+                    }
+                }
+
+                if (added > 0)
+                {
+                    this.Logger.LogMessage(
+                        $"{this.TypeName}.MetricFallbackToWmi",
+                        LogLevel.Information,
+                        telemetryContext.Clone()
+                            .AddContext("category", category)
+                            .AddContext("wmiClass", wmiClass)
+                            .AddContext("wmiCountersAdded", added)
+                            .AddContext("wmiInstances", allInstances.Count));
+                }
+            }
+            catch (Exception exc)
+            {
+                this.Logger.LogMessage(
+                    $"{this.TypeName}.WmiFallbackError",
+                    LogLevel.Warning,
+                    telemetryContext.Clone().AddError(exc));
+            }
+        }
+
 private void AddSupportedCounters(IEnumerable counters) { if (counters?.Any() == true)