Skip to content

Commit e58d2a6

Browse files
author
Michail Resvanis
committed
Enable FM in sandbox device plugin when FM shared-nvswitch mode
Signed-off-by: Michail Resvanis <mresvani@redhat.com>
1 parent 6da0eb2 commit e58d2a6

3 files changed

Lines changed: 177 additions & 2 deletions

File tree

controllers/object_controls.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,6 +1662,31 @@ func TransformSandboxDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPo
16621662
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
16631663
}
16641664
}
1665+
1666+
// Set ENABLE_FABRIC_MANAGER environment variable if shared-nvswitch mode is configured
1667+
if config.FabricManager.IsSharedNVSwitchMode() {
1668+
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "ENABLE_FABRIC_MANAGER", "true")
1669+
1670+
// Add fabric manager volume mount to the container
1671+
fabricManagerVolMount := corev1.VolumeMount{
1672+
Name: "run-nvidia-fabricmanager",
1673+
MountPath: "/run/nvidia-fabricmanager",
1674+
}
1675+
obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, fabricManagerVolMount)
1676+
1677+
// Add fabric manager volume to the pod spec
1678+
fabricManagerVol := corev1.Volume{
1679+
Name: "run-nvidia-fabricmanager",
1680+
VolumeSource: corev1.VolumeSource{
1681+
HostPath: &corev1.HostPathVolumeSource{
1682+
Path: "/run/nvidia-fabricmanager",
1683+
Type: ptr.To(corev1.HostPathDirectoryOrCreate),
1684+
},
1685+
},
1686+
}
1687+
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, fabricManagerVol)
1688+
}
1689+
16651690
return nil
16661691
}
16671692

controllers/object_controls_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -935,6 +935,8 @@ func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {
935935
switch testCase {
936936
case "default":
937937
// Do nothing
938+
case "fabric-manager-shared-nvswitch":
939+
cp.Spec.FabricManager.Mode = gpuv1.FabricModeSharedNVSwitch
938940
default:
939941
return nil
940942
}
@@ -950,11 +952,16 @@ func getSandboxDevicePluginTestOutput(testCase string) map[string]interface{} {
950952
"numDaemonsets": 1,
951953
"image": "nvcr.io/nvidia/kubevirt-device-plugin:v1.1.0",
952954
"imagePullSecret": "ngc-secret",
955+
"env": map[string]string{},
953956
}
954957

955958
switch testCase {
956959
case "default":
957960
// Do nothing
961+
case "fabric-manager-shared-nvswitch":
962+
output["env"] = map[string]string{
963+
"ENABLE_FABRIC_MANAGER": "true",
964+
}
958965
default:
959966
return nil
960967
}
@@ -975,6 +982,11 @@ func TestSandboxDevicePlugin(t *testing.T) {
975982
getSandboxDevicePluginTestInput("default"),
976983
getSandboxDevicePluginTestOutput("default"),
977984
},
985+
{
986+
"FabricManagerSharedNVSwitch",
987+
getSandboxDevicePluginTestInput("fabric-manager-shared-nvswitch"),
988+
getSandboxDevicePluginTestOutput("fabric-manager-shared-nvswitch"),
989+
},
978990
}
979991

980992
for _, tc := range testCases {
@@ -988,14 +1000,26 @@ func TestSandboxDevicePlugin(t *testing.T) {
9881000
}
9891001

9901002
image := ""
1003+
containerEnv := make(map[string]string)
9911004
for _, container := range ds.Spec.Template.Spec.Containers {
9921005
if strings.Contains(container.Name, "nvidia-sandbox-device-plugin-ctr") {
9931006
image = container.Image
1007+
for _, env := range container.Env {
1008+
containerEnv[env.Name] = env.Value
1009+
}
9941010
continue
9951011
}
9961012
}
9971013

9981014
require.Equal(t, tc.output["image"], image, "Unexpected configuration for nvidia-sandbox-device-plugin-ctr image")
1015+
1016+
// Check environment variables
1017+
expectedEnv := tc.output["env"].(map[string]string)
1018+
for envName, expectedValue := range expectedEnv {
1019+
actualValue, found := containerEnv[envName]
1020+
require.True(t, found, "Expected environment variable %s not found", envName)
1021+
require.Equal(t, expectedValue, actualValue, "Unexpected value for environment variable %s", envName)
1022+
}
9991023

10001024
// cleanup by deleting all kubernetes objects
10011025
err = removeState(&clusterPolicyController, clusterPolicyController.idx-1)

controllers/transforms_test.go

Lines changed: 128 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3207,17 +3207,27 @@ func TestTransformDriver(t *testing.T) {
32073207
client: mockClientMap["secret-env-client"],
32083208
expectedDs: NewDaemonset().WithContainer(corev1.Container{
32093209
Name: "nvidia-driver-ctr",
3210-
Image: "nvcr.io/nvidia/driver:570.172.08-",
3210+
Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04",
32113211
ImagePullPolicy: corev1.PullIfNotPresent,
32123212
Env: []corev1.EnvVar{
32133213
{
32143214
Name: "FABRIC_MANAGER_FABRIC_MODE",
32153215
Value: "1",
32163216
},
3217+
{
3218+
Name: "DRIVER_CONFIG_DIGEST",
3219+
Value: "2205091877",
3220+
},
32173221
},
32183222
}).WithInitContainer(corev1.Container{
32193223
Name: "k8s-driver-manager",
32203224
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
3225+
Env: []corev1.EnvVar{
3226+
{
3227+
Name: "DRIVER_CONFIG_DIGEST",
3228+
Value: "2205091877",
3229+
},
3230+
},
32213231
}),
32223232
errorExpected: false,
32233233
},
@@ -3243,17 +3253,27 @@ func TestTransformDriver(t *testing.T) {
32433253
client: mockClientMap["secret-env-client"],
32443254
expectedDs: NewDaemonset().WithContainer(corev1.Container{
32453255
Name: "nvidia-driver-ctr",
3246-
Image: "nvcr.io/nvidia/driver:570.172.08-",
3256+
Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04",
32473257
ImagePullPolicy: corev1.PullIfNotPresent,
32483258
Env: []corev1.EnvVar{
32493259
{
32503260
Name: "FABRIC_MANAGER_FABRIC_MODE",
32513261
Value: "0",
32523262
},
3263+
{
3264+
Name: "DRIVER_CONFIG_DIGEST",
3265+
Value: "240528038",
3266+
},
32533267
},
32543268
}).WithInitContainer(corev1.Container{
32553269
Name: "k8s-driver-manager",
32563270
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
3271+
Env: []corev1.EnvVar{
3272+
{
3273+
Name: "DRIVER_CONFIG_DIGEST",
3274+
Value: "240528038",
3275+
},
3276+
},
32573277
}),
32583278
errorExpected: false,
32593279
},
@@ -4721,3 +4741,109 @@ func TestHashDriverInstallConfigZeroFieldInvariant(t *testing.T) {
47214741
assert.NotEqual(t, originalDigest, changedDigest,
47224742
"a non-zero new field should change the digest")
47234743
}
4744+
4745+
// TestTransformSandboxDevicePlugin is a table-driven test for
// TransformSandboxDevicePlugin. Each case passes a minimal DaemonSet (one main
// container and one "toolkit-validation" init container) together with a
// ClusterPolicySpec to the transform, then compares the DaemonSet afterwards
// with a fully spelled-out expected DaemonSet.
//
// The two cases differ only in whether FabricManager.Mode is set to
// gpuv1.FabricModeSharedNVSwitch, which isolates the fabric manager additions
// (env var, volume mount, hostPath volume) from the image handling that both
// cases expect.
func TestTransformSandboxDevicePlugin(t *testing.T) {
4746+
// NOTE(review): initMockK8sClients is defined outside this view; presumably it
// prepares the package-level mock clients used by the transform tests — confirm.
initMockK8sClients()
4747+
testCases := []struct {
4748+
description string
4749+
ds Daemonset
4750+
cpSpec *gpuv1.ClusterPolicySpec
4751+
expectedDs Daemonset
4752+
errorExpected bool
4753+
}{
4754+
// Shared-nvswitch mode: the expected DaemonSet carries ENABLE_FABRIC_MANAGER=true,
// the "run-nvidia-fabricmanager" volume mount on the main container, and the
// matching hostPath volume on the pod spec.
{
4755+
description: "sandbox device plugin with fabric manager shared-nvswitch mode",
4756+
ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-sandbox-device-plugin-ctr"}).
4757+
WithInitContainer(corev1.Container{Name: "toolkit-validation"}),
4758+
cpSpec: &gpuv1.ClusterPolicySpec{
4759+
SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{
4760+
Repository: "nvcr.io/nvidia",
4761+
Image: "kubevirt-device-plugin",
4762+
Version: "v1.2.0",
4763+
},
4764+
FabricManager: gpuv1.FabricManagerSpec{
4765+
Mode: gpuv1.FabricModeSharedNVSwitch,
4766+
},
4767+
Validator: gpuv1.ValidatorSpec{
4768+
Repository: "nvcr.io/nvidia/cloud-native",
4769+
Image: "gpu-operator-validator",
4770+
Version: "v1.0.0",
4771+
},
4772+
},
4773+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
4774+
Name: "nvidia-sandbox-device-plugin-ctr",
4775+
Image: "nvcr.io/nvidia/kubevirt-device-plugin:v1.2.0",
4776+
ImagePullPolicy: corev1.PullIfNotPresent,
4777+
Env: []corev1.EnvVar{
4778+
{
4779+
Name: "ENABLE_FABRIC_MANAGER",
4780+
Value: "true",
4781+
},
4782+
},
4783+
VolumeMounts: []corev1.VolumeMount{
4784+
{
4785+
Name: "run-nvidia-fabricmanager",
4786+
MountPath: "/run/nvidia-fabricmanager",
4787+
},
4788+
},
4789+
}).WithInitContainer(corev1.Container{
4790+
Name: "toolkit-validation",
4791+
Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
4792+
SecurityContext: &corev1.SecurityContext{
4793+
RunAsUser: rootUID,
4794+
},
4795+
}).WithVolume(corev1.Volume{
4796+
Name: "run-nvidia-fabricmanager",
4797+
VolumeSource: corev1.VolumeSource{
4798+
HostPath: &corev1.HostPathVolumeSource{
4799+
Path: "/run/nvidia-fabricmanager",
4800+
Type: ptr.To(corev1.HostPathDirectoryOrCreate),
4801+
},
4802+
},
4803+
}),
4804+
errorExpected: false,
4805+
},
4806+
// FabricManager left at its zero value: the expected DaemonSet has the same
// images as above but no fabric manager env var, volume mount, or volume.
{
4807+
description: "sandbox device plugin without fabric manager shared-nvswitch mode",
4808+
ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-sandbox-device-plugin-ctr"}).
4809+
WithInitContainer(corev1.Container{Name: "toolkit-validation"}),
4810+
cpSpec: &gpuv1.ClusterPolicySpec{
4811+
SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{
4812+
Repository: "nvcr.io/nvidia",
4813+
Image: "kubevirt-device-plugin",
4814+
Version: "v1.2.0",
4815+
},
4816+
Validator: gpuv1.ValidatorSpec{
4817+
Repository: "nvcr.io/nvidia/cloud-native",
4818+
Image: "gpu-operator-validator",
4819+
Version: "v1.0.0",
4820+
},
4821+
},
4822+
expectedDs: NewDaemonset().WithContainer(corev1.Container{
4823+
Name: "nvidia-sandbox-device-plugin-ctr",
4824+
Image: "nvcr.io/nvidia/kubevirt-device-plugin:v1.2.0",
4825+
ImagePullPolicy: corev1.PullIfNotPresent,
4826+
}).WithInitContainer(corev1.Container{
4827+
Name: "toolkit-validation",
4828+
Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0",
4829+
SecurityContext: &corev1.SecurityContext{
4830+
RunAsUser: rootUID,
4831+
},
4832+
}),
4833+
errorExpected: false,
4834+
},
4835+
}
4836+
4837+
// Run every case as a subtest. The transform receives tc.ds.DaemonSet, so the
// assertion at the end compares tc.ds as left by the call with tc.expectedDs.
for _, tc := range testCases {
4838+
t.Run(tc.description, func(t *testing.T) {
4839+
err := TransformSandboxDevicePlugin(tc.ds.DaemonSet, tc.cpSpec,
4840+
ClusterPolicyController{operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")})
4841+
// For error cases only the error is checked; the DaemonSet comparison is skipped.
if tc.errorExpected {
4842+
require.Error(t, err)
4843+
return
4844+
}
4845+
require.NoError(t, err)
4846+
require.EqualValues(t, tc.expectedDs, tc.ds)
4847+
})
4848+
}
4849+
}

0 commit comments

Comments
 (0)