From 8337157bcd33736980015a02d9f14218179fe906 Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Thu, 8 Jan 2026 15:25:06 +0100 Subject: [PATCH 1/8] Add Fabric Manager configuration API types and CRD manifests Signed-off-by: Michail Resvanis --- api/nvidia/v1/clusterpolicy_types.go | 49 +++++++++++++++++++ api/nvidia/v1/zz_generated.deepcopy.go | 16 ++++++ .../manifests/nvidia.com_clusterpolicies.yaml | 11 +++++ .../crd/bases/nvidia.com_clusterpolicies.yaml | 11 +++++ .../crds/nvidia.com_clusterpolicies.yaml | 11 +++++ 5 files changed, 98 insertions(+) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index f79497e64..ca27fd2ae 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -96,6 +96,8 @@ type ClusterPolicySpec struct { HostPaths HostPathsSpec `json:"hostPaths,omitempty"` // KataSandboxDevicePlugin component spec KataSandboxDevicePlugin KataDevicePluginSpec `json:"kataSandboxDevicePlugin,omitempty"` + // FabricManager component spec + FabricManager FabricManagerSpec `json:"fabricManager,omitempty"` } // Runtime defines container runtime type @@ -1819,6 +1821,38 @@ type CDIConfigSpec struct { NRIPluginEnabled *bool `json:"nriPluginEnabled,omitempty"` } +// FabricMode defines the Fabric Manager mode +type FabricMode string + +const ( + // FabricModeFullPassthrough indicates Full-passthrough mode (FABRIC_MODE=0) + FabricModeFullPassthrough FabricMode = "full-passthrough" + // FabricModeSharedNVSwitch indicates Shared NVSwitch Virtualization mode (FABRIC_MODE=1) + FabricModeSharedNVSwitch FabricMode = "shared-nvswitch" +) + +func (f FabricMode) String() string { + switch f { + case FabricModeFullPassthrough: + return "full-passthrough" + case FabricModeSharedNVSwitch: + return "shared-nvswitch" + default: + return "" + } +} + +// FabricManagerSpec defines the properties for NVIDIA Fabric Manager configuration +type FabricManagerSpec struct { + // Mode indicates the Fabric Manager 
mode + // +kubebuilder:validation:Enum=full-passthrough;shared-nvswitch + // +kubebuilder:default=full-passthrough + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Fabric Manager Mode" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:full-passthrough,urn:alm:descriptor:com.tectonic.ui:select:shared-nvswitch" + Mode FabricMode `json:"mode,omitempty"` +} + // MIGStrategy indicates MIG mode type MIGStrategy string @@ -2334,3 +2368,18 @@ func (c *MIGPartedConfigSpec) GetName() string { func (c *VGPUDevicesConfigSpec) GetName() string { return ptr.Deref(c, VGPUDevicesConfigSpec{}).Name } + +// IsSharedNVSwitchMode returns true if Fabric Manager is configured for Shared NVSwitch mode +func (f *FabricManagerSpec) IsSharedNVSwitchMode() bool { + return f.Mode == FabricModeSharedNVSwitch +} + +// ValidateFabricManagerConfig validates the Fabric Manager configuration +func (c *ClusterPolicySpec) ValidateFabricManagerConfig() error { + if c.SandboxWorkloads.DefaultWorkload == "vm-passthrough" && + c.FabricManager.IsSharedNVSwitchMode() && + !c.Driver.IsEnabled() { + return fmt.Errorf("driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode") + } + return nil +} diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index f65e0648b..61ead151c 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -215,6 +215,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) { in.CCManager.DeepCopyInto(&out.CCManager) out.HostPaths = in.HostPaths in.KataSandboxDevicePlugin.DeepCopyInto(&out.KataSandboxDevicePlugin) + out.FabricManager = in.FabricManager } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterPolicySpec. 
@@ -839,6 +840,21 @@ func (in *EnvVar) DeepCopy() *EnvVar { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FabricManagerSpec) DeepCopyInto(out *FabricManagerSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FabricManagerSpec. +func (in *FabricManagerSpec) DeepCopy() *FabricManagerSpec { + if in == nil { + return nil + } + out := new(FabricManagerSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GDRCopySpec) DeepCopyInto(out *GDRCopySpec) { *out = *in diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 030581fc1..8dddbaf9e 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -1297,6 +1297,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 030581fc1..8dddbaf9e 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -1297,6 +1297,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: diff --git 
a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 030581fc1..8dddbaf9e 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1297,6 +1297,17 @@ spec: type: string type: object type: object + fabricManager: + description: FabricManager component spec + properties: + mode: + default: full-passthrough + description: Mode indicates the Fabric Manager mode + enum: + - full-passthrough + - shared-nvswitch + type: string + type: object gdrcopy: description: GDRCopy component spec properties: From abaeb18449450e7806d241b8c00a2f5ffcd09f81 Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Thu, 8 Jan 2026 15:25:54 +0100 Subject: [PATCH 2/8] Implement controller support for Fabric Manager configuration when vm-passthrough Signed-off-by: Michail Resvanis --- controllers/state_manager.go | 41 +++- controllers/state_manager_test.go | 323 ++++++++++++++++++++++++++++++ 2 files changed, 357 insertions(+), 7 deletions(-) diff --git a/controllers/state_manager.go b/controllers/state_manager.go index e7b11ca69..4d3e1b998 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -43,6 +43,7 @@ const ( commonGPULabelValue = "true" commonOperandsLabelKey = "nvidia.com/gpu.deploy.operands" commonOperandsLabelValue = "true" + driverLabelKey = "nvidia.com/gpu.deploy.driver" migManagerLabelKey = "nvidia.com/gpu.deploy.mig-manager" migManagerLabelValue = "true" migCapableLabelKey = "nvidia.com/mig.capable" @@ -119,10 +120,11 @@ var gpuNodeLabels = map[string]string{ } type gpuWorkloadConfiguration struct { - config string - sandboxMode string // SandboxWorkloads.Mode (e.g. "kubevirt", "kata") — only affects vm-passthrough labels - node string - log logr.Logger + config string + sandboxMode string // SandboxWorkloads.Mode (e.g. 
"kubevirt", "kata") — only affects vm-passthrough labels + node string + log logr.Logger + clusterPolicy *gpuv1.ClusterPolicy } // OpenShiftDriverToolkit contains the values required to deploy @@ -327,6 +329,15 @@ func isValidWorkloadConfig(workloadConfig string) bool { return ok } +// shouldDeployDriverForVMPassthrough returns true if driver should be deployed for vm-passthrough workload +// based on Fabric Manager configuration +func (w *gpuWorkloadConfiguration) shouldDeployDriverForVMPassthrough() bool { + if w.config != gpuWorkloadConfigVMPassthrough || w.clusterPolicy == nil { + return false + } + return w.clusterPolicy.Spec.FabricManager.IsSharedNVSwitchMode() +} + // getWorkloadConfig returns the GPU workload configured for the node. // If an error occurs when searching for the workload config, // return defaultGPUWorkloadConfig. @@ -346,13 +357,19 @@ func getWorkloadConfig(labels map[string]string, sandboxEnabled bool) (string, e // getEffectiveStateLabels returns the state labels to apply for the given workload config and sandbox mode. // When config is vm-passthrough and mode is "kata", returns labels with kata-device-plugin instead of sandbox-device-plugin. 
func getEffectiveStateLabels(config, mode string) map[string]string { - labels, ok := gpuStateLabels[config] + base, ok := gpuStateLabels[config] if !ok { return nil } if config != gpuWorkloadConfigVMPassthrough { - return labels + return base + } + + // Copy the base labels to avoid mutating the global map + labels := make(map[string]string, len(base)) + for k, v := range base { + labels[k] = v } // update labels for the sandbox modes for passthrough @@ -417,6 +434,16 @@ func (w *gpuWorkloadConfiguration) addGPUStateLabels(labels map[string]string) b modified = true } } + + // Add conditional driver deployment for vm-passthrough workload + if w.shouldDeployDriverForVMPassthrough() { + if _, ok := labels[driverLabelKey]; !ok { + w.log.Info("Setting node label for driver deployment in vm-passthrough with Fabric Manager shared-nvswitch mode", "NodeName", w.node, "Label", driverLabelKey, "Value", "true") + labels[driverLabelKey] = "true" + modified = true + } + } + if w.config == gpuWorkloadConfigContainer && hasMIGCapableGPU(labels) && !hasMIGManagerLabel(labels) { w.log.Info("Setting node label", "NodeName", w.node, "Label", migManagerLabelKey, "Value", migManagerLabelValue) labels[migManagerLabelKey] = migManagerLabelValue @@ -545,7 +572,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { } n.logger.Info("GPU workload configuration", "NodeName", node.Name, "GpuWorkloadConfig", config) mode := n.singleton.Spec.SandboxWorkloads.Mode - gpuWorkloadConfig := &gpuWorkloadConfiguration{config: config, sandboxMode: mode, node: node.Name, log: n.logger} + gpuWorkloadConfig := &gpuWorkloadConfiguration{config: config, sandboxMode: mode, node: node.Name, log: n.logger, clusterPolicy: n.singleton} if !hasCommonGPULabel(labels) && hasGPULabels(labels) { n.logger.Info("Node has GPU(s)", "NodeName", node.Name) // label the node with common Nvidia GPU label diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index 
35c245be0..add0988c5 100644 --- a/controllers/state_manager_test.go +++ b/controllers/state_manager_test.go @@ -20,7 +20,10 @@ import ( "errors" "testing" + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" "k8s.io/utils/ptr" @@ -371,3 +374,323 @@ func TestIsStateEnabled_SandboxAndKataDevicePlugin(t *testing.T) { }) } } + +func TestGpuWorkloadConfiguration_ShouldDeployDriverForVMPassthrough(t *testing.T) { + tests := []struct { + name string + config string + clusterPolicy *gpuv1.ClusterPolicy + expected bool + }{ + { + name: "non-vm-passthrough workload", + config: gpuWorkloadConfigContainer, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expected: false, + }, + { + name: "vm-passthrough with nil cluster policy", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: nil, + expected: false, + }, + { + name: "vm-passthrough with shared-nvswitch mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + expected: true, + }, + { + name: "vm-passthrough with full-passthrough mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + expected: false, + }, + { + name: "vm-passthrough with default (empty) fabric manager mode", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: "", // empty defaults to full-passthrough + }, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + 
workloadConfig := &gpuWorkloadConfiguration{ + config: tt.config, + node: "test-node", + log: logr.Discard(), + clusterPolicy: tt.clusterPolicy, + } + + result := workloadConfig.shouldDeployDriverForVMPassthrough() + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestGpuWorkloadConfiguration_AddGPUStateLabels(t *testing.T) { + tests := []struct { + name string + config string + clusterPolicy *gpuv1.ClusterPolicy + inputLabels map[string]string + expectedLabels map[string]string + expectModified bool + }{ + { + name: "vm-passthrough with shared-nvswitch adds driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectModified: true, + }, + { + name: "vm-passthrough with full-passthrough does not add driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + }, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + }, + expectModified: true, + }, + { + name: "container workload is not affected", + config: gpuWorkloadConfigContainer, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: 
gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{ + "existing-label": "value", + }, + expectedLabels: map[string]string{ + "existing-label": "value", + "nvidia.com/gpu.deploy.driver": "true", + "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.dcgm": "true", + "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "nvidia.com/gpu.deploy.node-status-exporter": "true", + "nvidia.com/gpu.deploy.operator-validator": "true", + }, + expectModified: true, + }, + { + name: "vm-passthrough with nil cluster policy does not add driver label", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: nil, + inputLabels: map[string]string{}, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + }, + expectModified: true, + }, + { + name: "driver label already exists - no modification", + config: gpuWorkloadConfigVMPassthrough, + clusterPolicy: &gpuv1.ClusterPolicy{ + Spec: gpuv1.ClusterPolicySpec{ + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + }, + inputLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + "nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectedLabels: map[string]string{ + "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", + "nvidia.com/gpu.deploy.sandbox-validator": "true", + "nvidia.com/gpu.deploy.vfio-manager": "true", + 
"nvidia.com/gpu.deploy.kata-manager": "true", + "nvidia.com/gpu.deploy.cc-manager": "true", + "nvidia.com/gpu.deploy.driver": "true", + }, + expectModified: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workloadConfig := &gpuWorkloadConfiguration{ + config: tt.config, + node: "test-node", + log: logr.Discard(), + clusterPolicy: tt.clusterPolicy, + } + + // Make a copy of input labels to avoid modifying the test data + labels := make(map[string]string) + for k, v := range tt.inputLabels { + labels[k] = v + } + + modified := workloadConfig.addGPUStateLabels(labels) + + assert.Equal(t, tt.expectModified, modified) + assert.Equal(t, tt.expectedLabels, labels) + }) + } +} + +func TestClusterPolicyValidateFabricManagerConfig(t *testing.T) { + tests := []struct { + name string + clusterPolicy *gpuv1.ClusterPolicySpec + expectError bool + errorMessage string + }{ + { + name: "valid configuration - vm-passthrough with shared-nvswitch and driver enabled", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectError: false, + }, + { + name: "valid configuration - vm-passthrough with full-passthrough mode", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectError: false, + }, + { + name: "valid configuration - container workload with any fabric manager mode", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "container", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: 
gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectError: false, + }, + { + name: "invalid configuration - vm-passthrough with shared-nvswitch but driver disabled", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectError: true, + errorMessage: "driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode", + }, + { + name: "valid configuration - vm-passthrough with shared-nvswitch and driver not specified (defaults to enabled)", + clusterPolicy: &gpuv1.ClusterPolicySpec{ + SandboxWorkloads: gpuv1.SandboxWorkloadsSpec{ + DefaultWorkload: "vm-passthrough", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Driver: gpuv1.DriverSpec{ + // Enabled not specified, defaults to true + }, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.clusterPolicy.ValidateFabricManagerConfig() + + if tt.expectError { + assert.Error(t, err) + assert.Contains(t, err.Error(), tt.errorMessage) + } else { + assert.NoError(t, err) + } + }) + } +} From 3c669cdda57ec1e2b2f5ece8666374cc2265477d Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Thu, 29 Jan 2026 11:31:21 +0100 Subject: [PATCH 3/8] Adjust driver startup probe for vm-passthrough with shared NVSwitch mode Signed-off-by: Michail Resvanis --- assets/state-driver/0400_configmap.yaml | 10 ++++++++-- .../testdata/golden/driver-additional-configs.yaml | 10 ++++++++-- internal/state/testdata/golden/driver-full-spec.yaml | 10 ++++++++-- .../testdata/golden/driver-gdrcopy-openshift.yaml | 10 ++++++++-- internal/state/testdata/golden/driver-gdrcopy.yaml | 10 ++++++++-- 
internal/state/testdata/golden/driver-gds.yaml | 10 ++++++++-- internal/state/testdata/golden/driver-minimal.yaml | 10 ++++++++-- .../golden/driver-openshift-drivertoolkit.yaml | 10 ++++++++-- internal/state/testdata/golden/driver-precompiled.yaml | 10 ++++++++-- .../state/testdata/golden/driver-rdma-hostmofed.yaml | 10 ++++++++-- internal/state/testdata/golden/driver-rdma.yaml | 10 ++++++++-- internal/state/testdata/golden/driver-secret-env.yaml | 10 ++++++++-- .../golden/driver-vgpu-host-manager-openshift.yaml | 10 ++++++++-- .../testdata/golden/driver-vgpu-host-manager.yaml | 10 ++++++++-- .../testdata/golden/driver-vgpu-licensing-secret.yaml | 10 ++++++++-- .../state/testdata/golden/driver-vgpu-licensing.yaml | 10 ++++++++-- manifests/state-driver/0400_configmap.yaml | 10 ++++++++-- 17 files changed, 136 insertions(+), 34 deletions(-) diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml index 67aa1e2ca..b96e0bd65 100644 --- a/assets/state-driver/0400_configmap.yaml +++ b/assets/state-driver/0400_configmap.yaml @@ -22,8 +22,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-additional-configs.yaml b/internal/state/testdata/golden/driver-additional-configs.yaml index 774daf599..20bfb9dcb 100644 --- a/internal/state/testdata/golden/driver-additional-configs.yaml +++ b/internal/state/testdata/golden/driver-additional-configs.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml index 24f859bb6..9c991dfc8 100644 --- a/internal/state/testdata/golden/driver-full-spec.yaml +++ b/internal/state/testdata/golden/driver-full-spec.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml index 391e22841..b51b54dc4 100644 --- a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-gdrcopy.yaml b/internal/state/testdata/golden/driver-gdrcopy.yaml index 77f21927a..e0d97a4ef 100644 --- a/internal/state/testdata/golden/driver-gdrcopy.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-gds.yaml b/internal/state/testdata/golden/driver-gds.yaml index 109c49709..9e0d8d08a 100644 --- a/internal/state/testdata/golden/driver-gds.yaml +++ b/internal/state/testdata/golden/driver-gds.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-minimal.yaml b/internal/state/testdata/golden/driver-minimal.yaml index d08ba1c2b..d0ef008da 100644 --- a/internal/state/testdata/golden/driver-minimal.yaml +++ b/internal/state/testdata/golden/driver-minimal.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml index ad978ad56..06b3aceb5 100644 --- a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml +++ b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-precompiled.yaml b/internal/state/testdata/golden/driver-precompiled.yaml index 8441a3438..d1939676b 100644 --- a/internal/state/testdata/golden/driver-precompiled.yaml +++ b/internal/state/testdata/golden/driver-precompiled.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml index c2f055b4a..1eb07e114 100644 --- a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml +++ b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-rdma.yaml b/internal/state/testdata/golden/driver-rdma.yaml index 7f6f1127a..ecd76df13 100644 --- a/internal/state/testdata/golden/driver-rdma.yaml +++ b/internal/state/testdata/golden/driver-rdma.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-secret-env.yaml b/internal/state/testdata/golden/driver-secret-env.yaml index 6db2ceb59..24b001fde 100644 --- a/internal/state/testdata/golden/driver-secret-env.yaml +++ b/internal/state/testdata/golden/driver-secret-env.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml index 376d0910e..668b9e435 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml index 110a71c56..dea6316db 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml index 9c63fa61c..99a749026 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml @@ -106,8 +106,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/internal/state/testdata/golden/driver-vgpu-licensing.yaml b/internal/state/testdata/golden/driver-vgpu-licensing.yaml index b04ed567f..85d91bfae 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing.yaml @@ -106,8 +106,14 @@ data: fi if ! 
nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" diff --git a/manifests/state-driver/0400_configmap.yaml b/manifests/state-driver/0400_configmap.yaml index 55ba3df55..34802a6d5 100644 --- a/manifests/state-driver/0400_configmap.yaml +++ b/manifests/state-driver/0400_configmap.yaml @@ -26,8 +26,14 @@ data: fi if ! nvidia-smi; then - echo "nvidia-smi failed" - exit 1 + # For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices + # Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1 + if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ] && lsmod | grep -q "^nvidia "; then + echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)" + else + echo "nvidia-smi failed" + exit 1 + fi fi GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" From 6287721e9cb05ad21621ea33f52d30407b71d46d Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Tue, 20 Jan 2026 15:59:22 +0100 Subject: [PATCH 4/8] Add FM env var to driver container when shared-nvswitch Signed-off-by: Michail Resvanis --- controllers/object_controls.go | 7 ++++ controllers/transforms_test.go | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index b436bcab1..64e8612f8 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -3553,6 +3553,13 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy 
} } + // Set Fabric Manager environment variable if configured + if config.FabricManager.IsSharedNVSwitchMode() { + setContainerEnv(driverContainer, "FABRIC_MANAGER_FABRIC_MODE", "1") + } else if config.FabricManager.Mode == gpuv1.FabricModeFullPassthrough { + setContainerEnv(driverContainer, "FABRIC_MANAGER_FABRIC_MODE", "0") + } + // no further repo configuration required when using pre-compiled drivers, return here. if config.Driver.UsePrecompiledDrivers() { return nil diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 86ade02a2..a4d861bda 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -3010,6 +3010,78 @@ func TestTransformDriver(t *testing.T) { }), errorExpected: false, }, + { + description: "driver with fabric manager shared-nvswitch mode", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-driver-ctr"}). + WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "driver", + Version: "570.172.08", + Manager: gpuv1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + Version: "v0.8.0", + }, + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + client: mockClientMap["secret-env-client"], + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-driver-ctr", + Image: "nvcr.io/nvidia/driver:570.172.08-", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + { + Name: "FABRIC_MANAGER_FABRIC_MODE", + Value: "1", + }, + }, + }).WithInitContainer(corev1.Container{ + Name: "k8s-driver-manager", + Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + }), + errorExpected: false, + }, + { + description: "driver with fabric manager full-passthrough mode", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-driver-ctr"}). 
+ WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "driver", + Version: "570.172.08", + Manager: gpuv1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + Version: "v0.8.0", + }, + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + client: mockClientMap["secret-env-client"], + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-driver-ctr", + Image: "nvcr.io/nvidia/driver:570.172.08-", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + { + Name: "FABRIC_MANAGER_FABRIC_MODE", + Value: "0", + }, + }, + }).WithInitContainer(corev1.Container{ + Name: "k8s-driver-manager", + Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + }), + errorExpected: false, + }, } for _, tc := range testCases { From f2e8b4529e14b09dd26f09287eeb005f7fffd668 Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Wed, 21 Jan 2026 15:32:56 +0100 Subject: [PATCH 5/8] Support vfio-manager with shared-nvswitch fabric manager mode When clusterPolicy.fabricManager.mode=shared-nvswitch and workload=vm-passthrough, the vfio-manager now preserves the NVIDIA driver for fabric management while enabling GPU device passthrough to VMs. Changes: - Modify TransformVFIOManager to detect shared-nvswitch mode. - Replace driver uninstall init container with device unbind init container. - Use vfio-manage unbind --all to detach devices from nvidia driver. - Keep nvidia driver loaded for fabric management functionality. - Add comprehensive unit tests for both normal and shared-nvswitch modes. The new flow for shared-nvswitch mode for the vfio-manager: 1. InitContainer: vfio-manage unbind --all (unbind from nvidia driver) 2. Container: vfio-manage bind --all (bind to vfio-pci) This enables simultaneous fabric management and VM passthrough capabilities. 
Signed-off-by: Michail Resvanis --- assets/state-vfio-manager/0400_configmap.yaml | 30 +++++++++ assets/state-vfio-manager/0500_daemonset.yaml | 13 +++- controllers/object_controls.go | 45 +++++++++++-- controllers/transforms_test.go | 66 ++++++++++++++++++- 4 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 assets/state-vfio-manager/0400_configmap.yaml diff --git a/assets/state-vfio-manager/0400_configmap.yaml b/assets/state-vfio-manager/0400_configmap.yaml new file mode 100644 index 000000000..55a15476f --- /dev/null +++ b/assets/state-vfio-manager/0400_configmap.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvidia-vfio-manager-entrypoint + namespace: "FILLED BY THE OPERATOR" + labels: + app: nvidia-vfio-manager +data: + init-entrypoint.sh: |- + #!/bin/sh + + if [ "${FABRIC_MANAGER_MODE}" = "shared-nvswitch" ]; then + # In shared-nvswitch mode, wait for driver to be ready before unbinding devices + echo "Shared NVSwitch mode detected, waiting for driver readiness..." + until [ -f /run/nvidia/validations/driver-ready ] + do + echo "waiting for the driver validations to be ready..." + sleep 5 + done + + set -o allexport + cat /run/nvidia/validations/driver-ready + . 
/run/nvidia/validations/driver-ready + + echo "Driver is ready, proceeding with device unbind" + exec vfio-manage unbind --all + else + # Default mode: uninstall the driver + exec driver-manager uninstall_driver + fi diff --git a/assets/state-vfio-manager/0500_daemonset.yaml b/assets/state-vfio-manager/0500_daemonset.yaml index 1039cc874..8f4cc1830 100644 --- a/assets/state-vfio-manager/0500_daemonset.yaml +++ b/assets/state-vfio-manager/0500_daemonset.yaml @@ -26,8 +26,9 @@ spec: - name: k8s-driver-manager image: "FILLED BY THE OPERATOR" imagePullPolicy: IfNotPresent - command: ["driver-manager"] - args: ["uninstall_driver"] + command: ["/bin/sh", "-c"] + args: + - /bin/init-entrypoint.sh env: - name: NODE_NAME valueFrom: @@ -47,6 +48,10 @@ spec: securityContext: privileged: true volumeMounts: + - name: nvidia-vfio-manager-entrypoint + readOnly: true + mountPath: /bin/init-entrypoint.sh + subPath: init-entrypoint.sh - name: run-nvidia mountPath: /run/nvidia mountPropagation: Bidirectional @@ -90,6 +95,10 @@ spec: command: ["vfio-manage unbind --all"] terminationGracePeriodSeconds: 30 volumes: + - name: nvidia-vfio-manager-entrypoint + configMap: + name: nvidia-vfio-manager-entrypoint + defaultMode: 448 - name: host-sys hostPath: path: /sys diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 64e8612f8..0f57d8059 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -2029,10 +2029,47 @@ func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec // TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { - // update k8s-driver-manager initContainer - err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil) - if err != nil { - return fmt.Errorf("failed to transform k8s-driver-manager initContainer 
for VFIO Manager: %v", err) + // Check if we're in shared-nvswitch mode + if config.FabricManager.IsSharedNVSwitchMode() { + // In shared-nvswitch mode, use the vfio-manager image for the init container + // and set FABRIC_MANAGER_MODE so the entrypoint script runs vfio-manage unbind + container := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager") + + mainImage, err := gpuv1.ImagePath(&config.VFIOManager) + if err != nil { + return err + } + + container.Name = "vfio-device-unbind" + container.Image = mainImage + container.ImagePullPolicy = gpuv1.ImagePullPolicy(config.VFIOManager.ImagePullPolicy) + + setContainerEnv(container, "FABRIC_MANAGER_MODE", "shared-nvswitch") + setContainerEnv(container, "HOST_ROOT", "/host") + + // Add nvidia-validations volume mount for driver-ready file + container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{ + Name: "nvidia-validations", + MountPath: "/run/nvidia/validations", + ReadOnly: true, + }) + + // Add nvidia-validations volume + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, corev1.Volume{ + Name: "nvidia-validations", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/run/nvidia/validations", + Type: &[]corev1.HostPathType{corev1.HostPathDirectoryOrCreate}[0], + }, + }, + }) + } else { + // Default behavior: update k8s-driver-manager initContainer + err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil) + if err != nil { + return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err) + } } // update image diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index a4d861bda..ea1f7bce4 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1912,7 +1912,7 @@ func TestTransformVFIOManager(t *testing.T) { expectedDaemonset Daemonset }{ { - description: "transform vfio manager", + description: 
"transform vfio manager - normal mode", daemonset: NewDaemonset(). WithContainer(corev1.Container{Name: "nvidia-vfio-manager"}). WithContainer(corev1.Container{Name: "sidecar"}). @@ -1935,6 +1935,9 @@ func TestTransformVFIOManager(t *testing.T) { Env: mockEnv, }, }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, }, expectedDaemonset: NewDaemonset(). WithContainer(corev1.Container{ @@ -1957,6 +1960,67 @@ func TestTransformVFIOManager(t *testing.T) { }). WithPullSecret(secret), }, + { + description: "transform vfio manager - shared-nvswitch mode", + daemonset: NewDaemonset(). + WithContainer(corev1.Container{Name: "nvidia-vfio-manager"}). + WithContainer(corev1.Container{Name: "sidecar"}). + WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + clusterPolicySpec: &gpuv1.ClusterPolicySpec{ + VFIOManager: gpuv1.VFIOManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "vfio-pci-manager", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{secret}, + Resources: &gpuv1.ResourceRequirements{Limits: resources.Limits, Requests: resources.Requests}, + Args: []string{"--test-flag"}, + Env: mockEnv, + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + expectedDaemonset: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "nvidia-vfio-manager", + Image: "nvcr.io/nvidia/cloud-native/vfio-pci-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--test-flag"}, + Env: mockEnvCore, + Resources: resources, + }). + WithContainer(corev1.Container{ + Name: "sidecar", + Resources: resources, + }). 
+ WithInitContainer(corev1.Container{ + Name: "vfio-device-unbind", + Image: "nvcr.io/nvidia/cloud-native/vfio-pci-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: "FABRIC_MANAGER_MODE", Value: "shared-nvswitch"}, + {Name: "HOST_ROOT", Value: "/host"}, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "nvidia-validations", + MountPath: "/run/nvidia/validations", + ReadOnly: true, + }, + }, + }). + WithVolume(corev1.Volume{ + Name: "nvidia-validations", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/run/nvidia/validations", + Type: &[]corev1.HostPathType{corev1.HostPathDirectoryOrCreate}[0], + }, + }, + }). + WithPullSecret(secret), + }, } for _, tc := range testCases { From d0682bfe35f42c599fbaeb93d41802e178288835 Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Thu, 22 Jan 2026 10:18:32 +0100 Subject: [PATCH 6/8] Add wait for vfio-pci sandbox validation Signed-off-by: Michail Resvanis --- assets/state-vfio-manager/0500_daemonset.yaml | 7 ++++++ cmd/nvidia-validator/main.go | 24 +++++++++++-------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/assets/state-vfio-manager/0500_daemonset.yaml b/assets/state-vfio-manager/0500_daemonset.yaml index 8f4cc1830..7dfc5aaa3 100644 --- a/assets/state-vfio-manager/0500_daemonset.yaml +++ b/assets/state-vfio-manager/0500_daemonset.yaml @@ -85,6 +85,9 @@ spec: readOnly: true - name: host-root mountPath: /host + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional securityContext: privileged: true seLinuxOptions: @@ -111,6 +114,10 @@ spec: hostPath: path: /run/nvidia type: DirectoryOrCreate + - name: run-nvidia-validations + hostPath: + path: /run/nvidia/validations + type: DirectoryOrCreate - name: host-root hostPath: path: "/" diff --git a/cmd/nvidia-validator/main.go b/cmd/nvidia-validator/main.go index 8ffa41e28..bbf9a1fe0 100644 --- a/cmd/nvidia-validator/main.go 
+++ b/cmd/nvidia-validator/main.go @@ -1657,18 +1657,22 @@ func (v *VfioPCI) validate() error { return err } - err = v.runValidation() - if err != nil { - return err - } - log.Info("Validation completed successfully - all devices are bound to vfio-pci") + for { + log.Info("Attempting to validate that all device are bound to vfio-pci") + err := v.runValidation() + if err != nil { + if !withWaitFlag { + return fmt.Errorf("error validating vfio-pci: %w", err) + } + log.Warningf("failed to validate vfio-pci, retrying after %d seconds\n", sleepIntervalSecondsFlag) + time.Sleep(time.Duration(sleepIntervalSecondsFlag) * time.Second) + continue + } - // delete status file is already present - err = createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile) - if err != nil { - return err + log.Info("Validation completed successfully - all devices are bound to vfio-pci") + + return createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile) } - return nil } func (v *VfioPCI) runValidation() error { From ebae6064f2d7c24e0b5a4062f9fe8f1657317c45 Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Thu, 22 Jan 2026 12:59:59 +0100 Subject: [PATCH 7/8] Add driver validation in sandbox when FM shared-nvswitch mode Signed-off-by: Michail Resvanis --- .../state-sandbox-validation/0200_role.yaml | 7 ++ .../0500_daemonset.yaml | 33 +++++++ controllers/object_controls.go | 15 +++ controllers/transforms_test.go | 98 +++++++++++++++++++ 4 files changed, 153 insertions(+) diff --git a/assets/state-sandbox-validation/0200_role.yaml b/assets/state-sandbox-validation/0200_role.yaml index 79da66ff7..e1f616acb 100644 --- a/assets/state-sandbox-validation/0200_role.yaml +++ b/assets/state-sandbox-validation/0200_role.yaml @@ -12,3 +12,10 @@ rules: - use resourceNames: - privileged +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list diff --git a/assets/state-sandbox-validation/0500_daemonset.yaml b/assets/state-sandbox-validation/0500_daemonset.yaml index 
fcc2aa12a..982f64b53 100644 --- a/assets/state-sandbox-validation/0500_daemonset.yaml +++ b/assets/state-sandbox-validation/0500_daemonset.yaml @@ -26,6 +26,36 @@ spec: priorityClassName: system-node-critical serviceAccountName: nvidia-sandbox-validator initContainers: + - name: driver-validation + image: "FILLED BY THE OPERATOR" + command: ["sh", "-c"] + args: ["nvidia-validator"] + env: + - name: WITH_WAIT + value: "true" + - name: COMPONENT + value: driver + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + securityContext: + privileged: true + seLinuxOptions: + level: "s0" + volumeMounts: + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer + - name: driver-install-path + mountPath: /run/nvidia/driver + mountPropagation: HostToContainer + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + - name: host-dev-char + mountPath: /host-dev-char - name: cc-manager-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] @@ -145,3 +175,6 @@ spec: - name: host-root hostPath: path: / + - name: host-dev-char + hostPath: + path: /dev/char diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0f57d8059..e08fce4a7 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -2290,12 +2290,27 @@ func TransformSandboxValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic "vgpu-devices", } + // Add driver validation when FabricManager.Mode is shared-nvswitch + if config.FabricManager.IsSharedNVSwitchMode() { + components = append(components, "driver") + } + for _, component := range components { if err := TransformValidatorComponent(config, &obj.Spec.Template.Spec, component); err != nil { validatorErr = errors.Join(validatorErr, err) } } + // Remove driver validation init container if NOT in shared-nvswitch mode + if !config.FabricManager.IsSharedNVSwitchMode() { + for i, initContainer := 
range obj.Spec.Template.Spec.InitContainers { + if initContainer.Name == "driver-validation" { + obj.Spec.Template.Spec.InitContainers = append(obj.Spec.Template.Spec.InitContainers[:i], obj.Spec.Template.Spec.InitContainers[i+1:]...) + break + } + } + } + if validatorErr != nil { n.logger.Info("WARN: errors transforming the validator containers: %v", validatorErr) } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index ea1f7bce4..17681f7c3 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -2827,6 +2827,104 @@ func TestTransformSandboxValidator(t *testing.T) { WithPullSecret("pull-secret"). WithRuntimeClassName("nvidia"), }, + { + description: "fabric manager shared-nvswitch mode - driver validation should be preserved", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "driver-validation", Image: "old-image"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "old-image", + }), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + }, + expectedDs: NewDaemonset(). + WithInitContainer(corev1.Container{ + Name: "driver-validation", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }), + }, + { + description: "fabric manager full-passthrough mode - driver validation should be removed", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "driver-validation", Image: "old-image"}). 
+ WithContainer(corev1.Container{ + Name: "dummy", + Image: "old-image", + }), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeFullPassthrough, + }, + }, + expectedDs: func() Daemonset { + ds := NewDaemonset(). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }) + // Set an empty InitContainers slice to match what happens after removal + ds.Spec.Template.Spec.InitContainers = []corev1.Container{} + return ds + }(), + }, + { + description: "no fabric manager mode specified - driver validation should be removed", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "driver-validation", Image: "old-image"}). + WithContainer(corev1.Container{ + Name: "dummy", + Image: "old-image", + }), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + }, + expectedDs: func() Daemonset { + ds := NewDaemonset(). 
+ WithContainer(corev1.Container{ + Name: "dummy", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }) + // Set an empty InitContainers slice to match what happens after removal + ds.Spec.Template.Spec.InitContainers = []corev1.Container{} + return ds + }(), + }, } for _, tc := range testCases { From 9b5433926ff11dcfc96d127d87fb97b0cb9ca2ab Mon Sep 17 00:00:00 2001 From: Michail Resvanis Date: Wed, 28 Jan 2026 11:23:50 +0100 Subject: [PATCH 8/8] Enable FM in sandbox device plugin when FM shared-nvswitch mode Signed-off-by: Michail Resvanis --- controllers/object_controls.go | 25 ++++++ controllers/object_controls_test.go | 24 +++++ controllers/transforms_test.go | 130 +++++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 2 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index e08fce4a7..1a2c35997 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1662,6 +1662,31 @@ func TransformSandboxDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPo setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) } } + + // Set ENABLE_FABRIC_MANAGER environment variable if shared-nvswitch mode is configured + if config.FabricManager.IsSharedNVSwitchMode() { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "ENABLE_FABRIC_MANAGER", "true") + + // Add fabric manager volume mount to the container + fabricManagerVolMount := corev1.VolumeMount{ + Name: "run-nvidia-fabricmanager", + MountPath: "/run/nvidia-fabricmanager", + } + obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, fabricManagerVolMount) + + // Add fabric manager volume to the pod spec + fabricManagerVol := corev1.Volume{ + Name: "run-nvidia-fabricmanager", + VolumeSource: corev1.VolumeSource{ + HostPath: 
&corev1.HostPathVolumeSource{ + Path: "/run/nvidia-fabricmanager", + Type: ptr.To(corev1.HostPathDirectoryOrCreate), + }, + }, + } + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, fabricManagerVol) + } + return nil } diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index f6df7340d..2be7bf15b 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -935,6 +935,8 @@ func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy { switch testCase { case "default": // Do nothing + case "fabric-manager-shared-nvswitch": + cp.Spec.FabricManager.Mode = gpuv1.FabricModeSharedNVSwitch default: return nil } @@ -950,11 +952,16 @@ func getSandboxDevicePluginTestOutput(testCase string) map[string]interface{} { "numDaemonsets": 1, "image": "nvcr.io/nvidia/kubevirt-device-plugin:v1.1.0", "imagePullSecret": "ngc-secret", + "env": map[string]string{}, } switch testCase { case "default": // Do nothing + case "fabric-manager-shared-nvswitch": + output["env"] = map[string]string{ + "ENABLE_FABRIC_MANAGER": "true", + } default: return nil } @@ -975,6 +982,11 @@ func TestSandboxDevicePlugin(t *testing.T) { getSandboxDevicePluginTestInput("default"), getSandboxDevicePluginTestOutput("default"), }, + { + "FabricManagerSharedNVSwitch", + getSandboxDevicePluginTestInput("fabric-manager-shared-nvswitch"), + getSandboxDevicePluginTestOutput("fabric-manager-shared-nvswitch"), + }, } for _, tc := range testCases { @@ -988,14 +1000,26 @@ func TestSandboxDevicePlugin(t *testing.T) { } image := "" + containerEnv := make(map[string]string) for _, container := range ds.Spec.Template.Spec.Containers { if strings.Contains(container.Name, "nvidia-sandbox-device-plugin-ctr") { image = container.Image + for _, env := range container.Env { + containerEnv[env.Name] = env.Value + } continue } } require.Equal(t, tc.output["image"], image, "Unexpected configuration for 
nvidia-sandbox-device-plugin-ctr image") + + // Check environment variables + expectedEnv := tc.output["env"].(map[string]string) + for envName, expectedValue := range expectedEnv { + actualValue, found := containerEnv[envName] + require.True(t, found, "Expected environment variable %s not found", envName) + require.Equal(t, expectedValue, actualValue, "Unexpected value for environment variable %s", envName) + } // cleanup by deleting all kubernetes objects err = removeState(&clusterPolicyController, clusterPolicyController.idx-1) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 17681f7c3..01a563f81 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -3194,17 +3194,27 @@ func TestTransformDriver(t *testing.T) { client: mockClientMap["secret-env-client"], expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ { Name: "FABRIC_MANAGER_FABRIC_MODE", Value: "1", }, + { + Name: "DRIVER_CONFIG_DIGEST", + Value: "2205091877", + }, }, }).WithInitContainer(corev1.Container{ Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + Env: []corev1.EnvVar{ + { + Name: "DRIVER_CONFIG_DIGEST", + Value: "2205091877", + }, + }, }), errorExpected: false, }, @@ -3230,17 +3240,27 @@ func TestTransformDriver(t *testing.T) { client: mockClientMap["secret-env-client"], expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ { Name: "FABRIC_MANAGER_FABRIC_MODE", Value: "0", }, + { + Name: "DRIVER_CONFIG_DIGEST", + Value: "240528038", + }, }, }).WithInitContainer(corev1.Container{ Name: 
"k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + Env: []corev1.EnvVar{ + { + Name: "DRIVER_CONFIG_DIGEST", + Value: "240528038", + }, + }, }), errorExpected: false, }, @@ -4708,3 +4728,109 @@ func TestHashDriverInstallConfigZeroFieldInvariant(t *testing.T) { assert.NotEqual(t, originalDigest, changedDigest, "a non-zero new field should change the digest") } + +func TestTransformSandboxDevicePlugin(t *testing.T) { + initMockK8sClients() + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + errorExpected bool + }{ + { + description: "sandbox device plugin with fabric manager shared-nvswitch mode", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-sandbox-device-plugin-ctr"}). + WithInitContainer(corev1.Container{Name: "toolkit-validation"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{ + Repository: "nvcr.io/nvidia", + Image: "kubevirt-device-plugin", + Version: "v1.2.0", + }, + FabricManager: gpuv1.FabricManagerSpec{ + Mode: gpuv1.FabricModeSharedNVSwitch, + }, + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-sandbox-device-plugin-ctr", + Image: "nvcr.io/nvidia/kubevirt-device-plugin:v1.2.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + { + Name: "ENABLE_FABRIC_MANAGER", + Value: "true", + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "run-nvidia-fabricmanager", + MountPath: "/run/nvidia-fabricmanager", + }, + }, + }).WithInitContainer(corev1.Container{ + Name: "toolkit-validation", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }).WithVolume(corev1.Volume{ + Name: "run-nvidia-fabricmanager", + 
VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/run/nvidia-fabricmanager", + Type: ptr.To(corev1.HostPathDirectoryOrCreate), + }, + }, + }), + errorExpected: false, + }, + { + description: "sandbox device plugin without fabric manager shared-nvswitch mode", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-sandbox-device-plugin-ctr"}). + WithInitContainer(corev1.Container{Name: "toolkit-validation"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{ + Repository: "nvcr.io/nvidia", + Image: "kubevirt-device-plugin", + Version: "v1.2.0", + }, + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-sandbox-device-plugin-ctr", + Image: "nvcr.io/nvidia/kubevirt-device-plugin:v1.2.0", + ImagePullPolicy: corev1.PullIfNotPresent, + }).WithInitContainer(corev1.Container{ + Name: "toolkit-validation", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }), + errorExpected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformSandboxDevicePlugin(tc.ds.DaemonSet, tc.cpSpec, + ClusterPolicyController{operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")}) + if tc.errorExpected { + require.Error(t, err) + return + } + require.NoError(t, err) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +}