Skip to content
49 changes: 49 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ type ClusterPolicySpec struct {
HostPaths HostPathsSpec `json:"hostPaths,omitempty"`
// KataSandboxDevicePlugin component spec
KataSandboxDevicePlugin KataDevicePluginSpec `json:"kataSandboxDevicePlugin,omitempty"`
// FabricManager component spec
FabricManager FabricManagerSpec `json:"fabricManager,omitempty"`
}

// Runtime defines container runtime type
Expand Down Expand Up @@ -1819,6 +1821,38 @@ type CDIConfigSpec struct {
NRIPluginEnabled *bool `json:"nriPluginEnabled,omitempty"`
}

// FabricMode is the string-typed selector for the Fabric Manager operating mode.
type FabricMode string

// Supported Fabric Manager modes.
const (
	// FabricModeFullPassthrough selects Full-passthrough mode (FABRIC_MODE=0).
	FabricModeFullPassthrough FabricMode = "full-passthrough"
	// FabricModeSharedNVSwitch selects Shared NVSwitch Virtualization mode (FABRIC_MODE=1).
	FabricModeSharedNVSwitch FabricMode = "shared-nvswitch"
)

// String returns the textual form of a recognised mode and the empty string
// for any other value.
func (f FabricMode) String() string {
	if f == FabricModeFullPassthrough || f == FabricModeSharedNVSwitch {
		// The constants' underlying values are exactly the strings to report.
		return string(f)
	}
	return ""
}

// FabricManagerSpec defines the properties for NVIDIA Fabric Manager configuration.
// Its only field, Mode, selects the Fabric Manager mode; the kubebuilder markers on
// that field restrict it to "full-passthrough" or "shared-nvswitch" and default it
// to "full-passthrough".
type FabricManagerSpec struct {
// Mode indicates the Fabric Manager mode
// +kubebuilder:validation:Enum=full-passthrough;shared-nvswitch
// +kubebuilder:default=full-passthrough
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Fabric Manager Mode"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:full-passthrough,urn:alm:descriptor:com.tectonic.ui:select:shared-nvswitch"
Mode FabricMode `json:"mode,omitempty"`
}

// MIGStrategy indicates MIG mode
type MIGStrategy string

Expand Down Expand Up @@ -2334,3 +2368,18 @@ func (c *MIGPartedConfigSpec) GetName() string {
// GetName returns the Name of the VGPUDevicesConfigSpec. A nil receiver is
// replaced by a zero-value spec, so the result is then the empty string.
func (c *VGPUDevicesConfigSpec) GetName() string {
	spec := ptr.Deref(c, VGPUDevicesConfigSpec{})
	return spec.Name
}

// IsSharedNVSwitchMode returns true if Fabric Manager is configured for Shared NVSwitch mode.
// A nil receiver is treated as "not configured" and reports false instead of panicking
// on the field access.
func (f *FabricManagerSpec) IsSharedNVSwitchMode() bool {
	return f != nil && f.Mode == FabricModeSharedNVSwitch
}

// ValidateFabricManagerConfig checks the Fabric Manager settings against the rest of
// the ClusterPolicySpec. It returns an error only when the default sandbox workload is
// "vm-passthrough", Fabric Manager is in Shared NVSwitch mode, and the driver is not
// enabled; every other combination yields nil.
func (c *ClusterPolicySpec) ValidateFabricManagerConfig() error {
	// The checks run in the same order as a short-circuiting && chain would.
	if c.SandboxWorkloads.DefaultWorkload != "vm-passthrough" {
		return nil
	}
	if !c.FabricManager.IsSharedNVSwitchMode() {
		return nil
	}
	if c.Driver.IsEnabled() {
		return nil
	}
	return fmt.Errorf("driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode")
}
16 changes: 16 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions assets/state-driver/0400_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ data:
fi

if ! nvidia-smi; then
echo "nvidia-smi failed"
exit 1
# For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices
# Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1
Comment on lines +25 to +26
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question (for my understanding) -- GPUs may not be bound to the nvidia driver since there is a chance that the vfio-manager ran already and unbound the devices? Am I understanding this correctly?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's correct. In shared-nvswitch mode, the vfio-manager runs vfio-manage unbind --all to unbind GPUs from the nvidia driver and rebind them to vfio-pci for VM passthrough. The nvidia kernel module remains loaded (needed for Fabric Manager / NVSwitch management), but since the GPU devices are no longer bound to the nvidia driver, nvidia-smi fails. So we fall back to just verifying the kernel module is loaded, which is sufficient for this mode.

if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ]; then
echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)"
else
echo "nvidia-smi failed"
exit 1
fi
fi

GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
Expand Down
7 changes: 7 additions & 0 deletions assets/state-sandbox-validation/0200_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,10 @@ rules:
- use
resourceNames:
- privileged
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- get
- list
33 changes: 33 additions & 0 deletions assets/state-sandbox-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,36 @@ spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-sandbox-validator
initContainers:
- name: driver-validation
image: "FILLED BY THE OPERATOR"
command: ["sh", "-c"]
args: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
- name: COMPONENT
value: driver
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
securityContext:
privileged: true
seLinuxOptions:
level: "s0"
volumeMounts:
- name: host-root
mountPath: /host
readOnly: true
mountPropagation: HostToContainer
- name: driver-install-path
mountPath: /run/nvidia/driver
mountPropagation: HostToContainer
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
- name: host-dev-char
mountPath: /host-dev-char
- name: cc-manager-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
Expand Down Expand Up @@ -145,3 +175,6 @@ spec:
- name: host-root
hostPath:
path: /
- name: host-dev-char
hostPath:
path: /dev/char
30 changes: 30 additions & 0 deletions assets/state-vfio-manager/0400_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# ConfigMap holding the init-entrypoint.sh script for nvidia-vfio-manager.
# The script branches on the FABRIC_MANAGER_MODE environment variable:
#   - "shared-nvswitch": poll every 5 seconds until /run/nvidia/validations/driver-ready
#     exists, source that file, then exec `vfio-manage unbind --all`.
#   - any other value (including unset): exec `driver-manager uninstall_driver`.
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-vfio-manager-entrypoint
namespace: "FILLED BY THE OPERATOR"
labels:
app: nvidia-vfio-manager
data:
init-entrypoint.sh: |-
#!/bin/sh

if [ "${FABRIC_MANAGER_MODE}" = "shared-nvswitch" ]; then
# In shared-nvswitch mode, wait for driver to be ready before unbinding devices
echo "Shared NVSwitch mode detected, waiting for driver readiness..."
until [ -f /run/nvidia/validations/driver-ready ]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done

# allexport marks every variable assigned from here on for export, so whatever
# driver-ready assigns when sourced is passed to the exec'd vfio-manage process.
# NOTE(review): presumably driver-ready contains KEY=VALUE lines - confirm.
set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready

echo "Driver is ready, proceeding with device unbind"
exec vfio-manage unbind --all
else
# Default mode: uninstall the driver
exec driver-manager uninstall_driver
fi
20 changes: 18 additions & 2 deletions assets/state-vfio-manager/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ spec:
- name: k8s-driver-manager
image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
command: ["driver-manager"]
args: ["uninstall_driver"]
command: ["/bin/sh", "-c"]
args:
- /bin/init-entrypoint.sh
env:
- name: NODE_NAME
valueFrom:
Expand All @@ -47,6 +48,10 @@ spec:
securityContext:
privileged: true
volumeMounts:
- name: nvidia-vfio-manager-entrypoint
readOnly: true
mountPath: /bin/init-entrypoint.sh
subPath: init-entrypoint.sh
- name: run-nvidia
mountPath: /run/nvidia
mountPropagation: Bidirectional
Expand Down Expand Up @@ -80,6 +85,9 @@ spec:
readOnly: true
- name: host-root
mountPath: /host
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
securityContext:
privileged: true
seLinuxOptions:
Expand All @@ -90,6 +98,10 @@ spec:
command: ["vfio-manage unbind --all"]
terminationGracePeriodSeconds: 30
volumes:
- name: nvidia-vfio-manager-entrypoint
configMap:
name: nvidia-vfio-manager-entrypoint
defaultMode: 448
- name: host-sys
hostPath:
path: /sys
Expand All @@ -102,6 +114,10 @@ spec:
hostPath:
path: /run/nvidia
type: DirectoryOrCreate
- name: run-nvidia-validations
hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
- name: host-root
hostPath:
path: "/"
11 changes: 11 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,17 @@ spec:
type: string
type: object
type: object
fabricManager:
description: FabricManager component spec
properties:
mode:
default: full-passthrough
description: Mode indicates the Fabric Manager mode
enum:
- full-passthrough
- shared-nvswitch
type: string
type: object
gdrcopy:
description: GDRCopy component spec
properties:
Expand Down
24 changes: 14 additions & 10 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1657,18 +1657,22 @@ func (v *VfioPCI) validate() error {
return err
}

err = v.runValidation()
if err != nil {
return err
}
log.Info("Validation completed successfully - all devices are bound to vfio-pci")
for {
log.Info("Attempting to validate that all device are bound to vfio-pci")
err := v.runValidation()
if err != nil {
if !withWaitFlag {
return fmt.Errorf("error validating vfio-pci: %w", err)
}
log.Warningf("failed to validate vfio-pci, retrying after %d seconds\n", sleepIntervalSecondsFlag)
time.Sleep(time.Duration(sleepIntervalSecondsFlag) * time.Second)
continue
}

// delete status file is already present
err = createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile)
if err != nil {
return err
log.Info("Validation completed successfully - all devices are bound to vfio-pci")

return createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile)
}
return nil
}

func (v *VfioPCI) runValidation() error {
Expand Down
11 changes: 11 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,17 @@ spec:
type: string
type: object
type: object
fabricManager:
description: FabricManager component spec
properties:
mode:
default: full-passthrough
description: Mode indicates the Fabric Manager mode
enum:
- full-passthrough
- shared-nvswitch
type: string
type: object
gdrcopy:
description: GDRCopy component spec
properties:
Expand Down
Loading