diff --git a/assets/state-node-status-exporter/0800_prometheus_rule_openshift.yaml b/assets/state-node-status-exporter/0800_prometheus_rule_openshift.yaml index ab7237fd4..c89954107 100644 --- a/assets/state-node-status-exporter/0800_prometheus_rule_openshift.yaml +++ b/assets/state-node-status-exporter/0800_prometheus_rule_openshift.yaml @@ -10,7 +10,10 @@ spec: - name: Alert on node deployment failure rules: - alert: GPUOperatorNodeDeploymentFailed - # There is no GPU exposed on the node, + # There is no GPU exposed on the node. + # When the device plugin is intentionally disabled in the ClusterPolicy + # (devicePlugin.enabled: false), the metric is set to -1, so this + # alert will not fire in that case. expr: | gpu_operator_node_device_plugin_devices_total == 0 for: 30m diff --git a/cmd/nvidia-validator/metrics.go b/cmd/nvidia-validator/metrics.go index 4105dd166..2084c99f4 100644 --- a/cmd/nvidia-validator/metrics.go +++ b/cmd/nvidia-validator/metrics.go @@ -306,7 +306,15 @@ func (nm *NodeMetrics) Run() error { go nm.watchStatusFile(&nm.cudaReady, cudaStatusFile) go nm.watchDriverValidation() - go nm.watchDevicePluginValidation() + if os.Getenv("DEVICE_PLUGIN_ENABLED") != "false" { + go nm.watchDevicePluginValidation() + } else { + // Set to -1 so the alert (expr: == 0) does not fire. + // The gauge is auto-registered by promauto and defaults to 0, + // which would be a false positive. + nm.deviceCount.Set(-1) + log.Info("metrics: DevicePlugin is disabled in ClusterPolicy, skipping device plugin validation") + } go nm.watchNVIDIAPCI() log.Printf("Running the metrics server, listening on :%d/metrics", nm.port) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index b436bcab1..2d675b987 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -122,6 +122,8 @@ const ( NvidiaDisableRequireEnvName = "NVIDIA_DISABLE_REQUIRE" // GDSEnabledEnvName is the env name to enable GDS support with device-plugin GDSEnabledEnvName = "GDS_ENABLED" + // DevicePluginEnabledEnvName indicates whether the device plugin is enabled in the ClusterPolicy + DevicePluginEnabledEnvName = "DEVICE_PLUGIN_ENABLED" // MOFEDEnabledEnvName is the env name to enable MOFED devices injection with device-plugin MOFEDEnabledEnvName = "MOFED_ENABLED" // GDRCopyEnabledEnvName is the envvar that enables injection of the GDRCopy device node with the device-plugin @@ -2450,6 +2452,12 @@ func TransformNodeStatusExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol obj.Spec.Template.Spec.Containers[0].Args = config.NodeStatusExporter.Args } + devicePluginEnabled := "true" + if !config.DevicePlugin.IsEnabled() { + devicePluginEnabled = "false" + } + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DevicePluginEnabledEnvName, devicePluginEnabled) + // set/append environment variables for exporter container if len(config.NodeStatusExporter.Env) > 0 { for _, env := range config.NodeStatusExporter.Env { diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 86ade02a2..06cb62cfc 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -2891,6 +2891,35 @@ func TestTransformNodeStatusExporter(t *testing.T) { Name: "dummy", Image: "nvcr.io/nvidia/cloud-native/node-status-exporter:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: DevicePluginEnabledEnvName, Value: "true"}, + }, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }), + }, + { + description: "node status exporter with device plugin disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + NodeStatusExporter: gpuv1.NodeStatusExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "node-status-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + }, + DevicePlugin: gpuv1.DevicePluginSpec{Enabled: newBoolPtr(false)}, + }, + expectedDs: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/node-status-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: DevicePluginEnabledEnvName, Value: "false"}, + }, SecurityContext: &corev1.SecurityContext{ RunAsUser: rootUID, }, diff --git a/tests/e2e/helpers/clusterpolicy.go b/tests/e2e/helpers/clusterpolicy.go index c75c3473a..fba7cf1b7 100644 --- a/tests/e2e/helpers/clusterpolicy.go +++ b/tests/e2e/helpers/clusterpolicy.go @@ -106,6 +106,18 @@ func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error }) } +func (h *ClusterPolicyClient) EnableDevicePlugin(ctx context.Context, name string) error { + return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) { + clusterPolicy.Spec.DevicePlugin.Enabled = ptr.To(true) + }) +} + +func (h *ClusterPolicyClient) DisableDevicePlugin(ctx context.Context, name string) error { + return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) { + clusterPolicy.Spec.DevicePlugin.Enabled = ptr.To(false) + }) +} + func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error { return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) { clusterPolicy.Spec.MIG.Strategy = nvidiav1.MIGStrategy(strategy) diff --git a/tests/e2e/suites/clusterpolicy_test.go b/tests/e2e/suites/clusterpolicy_test.go index 92cdbde43..3b7db0331 100644 --- a/tests/e2e/suites/clusterpolicy_test.go +++ b/tests/e2e/suites/clusterpolicy_test.go @@ -328,6 +328,43 @@ var _ = Describe("ClusterPolicy Management", Label("clusterPolicy"), func() { }) }) + // test_device_plugin_disabled_env - Verify DEVICE_PLUGIN_ENABLED env var propagation + When("Disabling device plugin", Label("device-plugin", "toggle"), func() { + It("should set DEVICE_PLUGIN_ENABLED=false on node-status-exporter when device plugin is disabled", func(ctx context.Context) { + clusterPolicy := getClusterPolicyOrSkip(ctx, clusterPolicyClient, policyName) + originalState := clusterPolicy.Spec.DevicePlugin.Enabled + DeferCleanup(func(ctx context.Context) { + if originalState == nil || *originalState { + _ = clusterPolicyClient.EnableDevicePlugin(ctx, policyName) + waitForDaemonSetReady(ctx, daemonSetClient, testNamespace, "nvidia-device-plugin-daemonset") + } + }) + + err := clusterPolicyClient.DisableDevicePlugin(ctx, policyName) + Expect(err).NotTo(HaveOccurred(), "Failed to disable device plugin in ClusterPolicy") + + verifyEnvInDaemonSet(ctx, daemonSetClient, testNamespace, + "nvidia-node-status-exporter", "DEVICE_PLUGIN_ENABLED", "false") + }) + + It("should set DEVICE_PLUGIN_ENABLED=true on node-status-exporter when device plugin is re-enabled", func(ctx context.Context) { + clusterPolicy := getClusterPolicyOrSkip(ctx, clusterPolicyClient, policyName) + originalState := clusterPolicy.Spec.DevicePlugin.Enabled + DeferCleanup(func(ctx context.Context) { + if originalState != nil && !*originalState { + _ = clusterPolicyClient.DisableDevicePlugin(ctx, policyName) + } + }) + + err := clusterPolicyClient.EnableDevicePlugin(ctx, policyName) + Expect(err).NotTo(HaveOccurred(), "Failed to enable device plugin in ClusterPolicy") + + verifyEnvInDaemonSet(ctx, daemonSetClient, testNamespace, + "nvidia-node-status-exporter", "DEVICE_PLUGIN_ENABLED", "true") + waitForDaemonSetReady(ctx, daemonSetClient, testNamespace, "nvidia-device-plugin-daemonset") + }) + }) + // test_custom_labels_override - Test custom labels on daemonsets When("Updating daemonset custom labels", Label("labels", "config"), func() { It("should apply custom labels to all operand pods", func(ctx context.Context) {