Skip to content

Commit e2e31ca

Browse files
committed
store os release name and use it than recalculating from tag which is errorprone
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent d5750f2 commit e2e31ca

7 files changed

Lines changed: 194 additions & 41 deletions

File tree

controllers/object_controls.go

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3298,16 +3298,12 @@ func resolveDriverTag(n ClusterPolicyController, driverSpec interface{}) (string
32983298
return image, nil
32993299
}
33003300

3301-
func getOSName(osTag string) string {
3302-
// Extract base OS ID by stripping version suffix from osTag
3303-
// Examples: "rhel10" -> "rhel", "ubuntu22.04" -> "ubuntu", "rocky9" -> "rocky"
3304-
osID := strings.TrimRight(osTag, "0123456789.")
3305-
return osID
3306-
}
3307-
33083301
// getRepoConfigPath returns the standard OS specific path for repository configuration files.
33093302
func (n ClusterPolicyController) getRepoConfigPath() (string, error) {
3310-
osID := getOSName(n.gpuNodeOSTag)
3303+
osID := n.gpuNodeOSRelease
3304+
if osID == "" {
3305+
return "", fmt.Errorf("GPU node OS name is empty")
3306+
}
33113307
if path, ok := RepoConfigPathMap[osID]; ok {
33123308
return path, nil
33133309
}
@@ -3316,7 +3312,10 @@ func (n ClusterPolicyController) getRepoConfigPath() (string, error) {
33163312

33173313
// getCertConfigPath returns the standard OS specific path for ssl keys/certificates.
33183314
func (n ClusterPolicyController) getCertConfigPath() (string, error) {
3319-
osID := getOSName(n.gpuNodeOSTag)
3315+
osID := n.gpuNodeOSRelease
3316+
if osID == "" {
3317+
return "", fmt.Errorf("GPU node OS name is empty")
3318+
}
33203319
if path, ok := CertConfigPathMap[osID]; ok {
33213320
return path, nil
33223321
}
@@ -3326,7 +3325,10 @@ func (n ClusterPolicyController) getCertConfigPath() (string, error) {
33263325
// getSubscriptionPathsToVolumeSources returns the MountPathToVolumeSource map containing all
33273326
// OS-specific subscription/entitlement paths that need to be mounted in the container.
33283327
func (n ClusterPolicyController) getSubscriptionPathsToVolumeSources() (MountPathToVolumeSource, error) {
3329-
osID := getOSName(n.gpuNodeOSTag)
3328+
osID := n.gpuNodeOSRelease
3329+
if osID == "" {
3330+
return nil, fmt.Errorf("GPU node OS name is empty")
3331+
}
33303332
if pathToVolumeSource, ok := SubscriptionPathMap[osID]; ok {
33313333
return pathToVolumeSource, nil
33323334
}
@@ -3594,7 +3596,10 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
35943596
}
35953597
}
35963598

3597-
osID := getOSName(n.gpuNodeOSTag)
3599+
osID := n.gpuNodeOSRelease
3600+
if osID == "" {
3601+
return fmt.Errorf("ERROR: failed to determine GPU node OS name")
3602+
}
35983603
// set up subscription entitlements for RHEL(using K8s with a non-CRIO runtime) and SLES
35993604
if (osID == "rhel" && n.openshift == "" && n.runtime != gpuv1.CRIO) || osID == "sles" || osID == "sl-micro" {
36003605
n.logger.Info("Mounting subscriptions into the driver container", "OS", osID)

controllers/object_controls_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,13 +225,14 @@ func setup() error {
225225
if gpuNodeCount == 0 {
226226
return fmt.Errorf("no gpu nodes in mock cluster")
227227
}
228-
gpuNodeOSTag, err := clusterPolicyController.getGPUNodeOSTag()
228+
gpuNodeOSRelease, gpuNodeOSTag, err := clusterPolicyController.getGPUNodeOSInfo()
229229
if err != nil {
230230
return fmt.Errorf("unable to get GPU node tag: %w", err)
231231
}
232232

233233
clusterPolicyController.hasGPUNodes = gpuNodeCount != 0
234234
clusterPolicyController.hasNFDLabels = hasNFDLabels
235+
clusterPolicyController.gpuNodeOSRelease = gpuNodeOSRelease
235236
clusterPolicyController.gpuNodeOSTag = gpuNodeOSTag
236237

237238
// setup kernelVersionMap for pre-compiled driver tests

controllers/state_manager.go

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,12 @@ type ClusterPolicyController struct {
164164
openshift string
165165
ocpDriverToolkit OpenShiftDriverToolkit
166166

167-
runtime gpuv1.Runtime
168-
gpuNodeOSTag string
169-
hasGPUNodes bool
170-
hasNFDLabels bool
171-
sandboxEnabled bool
167+
runtime gpuv1.Runtime
168+
gpuNodeOSTag string
169+
gpuNodeOSRelease string
170+
hasGPUNodes bool
171+
hasNFDLabels bool
172+
sandboxEnabled bool
172173
}
173174

174175
func addState(n *ClusterPolicyController, path string) {
@@ -637,7 +638,7 @@ func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
637638
return runtime, nil
638639
}
639640

640-
func (n *ClusterPolicyController) getGPUNodeOSTag() (string, error) {
641+
func (n *ClusterPolicyController) getGPUNodeOSInfo() (string, string, error) {
641642
ctx := n.ctx
642643
opts := []client.ListOption{
643644
client.MatchingLabels(map[string]string{commonGPULabelKey: commonGPULabelValue}),
@@ -646,34 +647,54 @@ func (n *ClusterPolicyController) getGPUNodeOSTag() (string, error) {
646647
nodeList := &corev1.NodeList{}
647648
err := n.client.List(ctx, nodeList, opts...)
648649
if err != nil {
649-
return "", fmt.Errorf("unable to list nodes with GPU present: %w", err)
650+
return "", "", fmt.Errorf("unable to list nodes with GPU present: %w", err)
650651
}
651652
if len(nodeList.Items) == 0 {
652-
return "", fmt.Errorf("no nodes found with GPU present")
653+
return "", "", fmt.Errorf("no nodes found with GPU present")
653654
}
654655

655656
labels := nodeList.Items[0].Labels
656657
osName, ok := labels[nfdOSReleaseIDLabelKey]
657658
if !ok {
658-
return "", fmt.Errorf("unable to retrieve OS name from label %s", nfdOSReleaseIDLabelKey)
659+
return "", "", fmt.Errorf("unable to retrieve OS name from label %s", nfdOSReleaseIDLabelKey)
659660
}
660661
osVersion, ok := labels[nfdOSVersionIDLabelKey]
661662
if !ok {
662-
return "", fmt.Errorf("unable to retrieve OS version from label %s", nfdOSVersionIDLabelKey)
663+
return "", "", fmt.Errorf("unable to retrieve OS version from label %s", nfdOSVersionIDLabelKey)
663664
}
664665
osMajorVersion := strings.Split(osVersion, ".")[0]
665-
osMajorNumber, err := strconv.Atoi(osMajorVersion)
666-
if err != nil {
667-
return "", fmt.Errorf("error processing OS major version %s: %w", osMajorVersion, err)
668-
}
669666

670667
// If the OS is RockyLinux or RHEL 10 & above, we will omit the minor version when constructing the os image tag
671-
if osName == "rocky" || (osName == "rhel" && osMajorNumber >= 10) {
668+
switch osName {
669+
case "rocky":
672670
osVersion = osMajorVersion
671+
case "rhel":
672+
osMajorNumber, err := parseOSMajorVersion(osVersion)
673+
if err != nil {
674+
n.logger.Info("Unable to parse RHEL major version, using full OS version for GPU node OS tag", "osVersion", osVersion, "error", err)
675+
} else if osMajorNumber >= 10 {
676+
osVersion = osMajorVersion
677+
}
673678
}
674679
osTag := fmt.Sprintf("%s%s", osName, osVersion)
675680

676-
return osTag, nil
681+
return osName, osTag, nil
682+
}
683+
684+
func parseOSMajorVersion(osVersion string) (int, error) {
685+
osMajorVersion := strings.Split(osVersion, ".")[0]
686+
osMajorVersion = strings.TrimSpace(osMajorVersion)
687+
osMajorVersion = strings.TrimPrefix(strings.TrimPrefix(osMajorVersion, "v"), "V")
688+
if osMajorVersion == "" {
689+
return 0, fmt.Errorf("empty OS major version")
690+
}
691+
692+
osMajorNumber, err := strconv.Atoi(osMajorVersion)
693+
if err != nil {
694+
return 0, fmt.Errorf("error processing OS major version %s: %w", osMajorVersion, err)
695+
}
696+
697+
return osMajorNumber, nil
677698
}
678699

679700
func (n *ClusterPolicyController) setPodSecurityLabelsForNamespace() error {
@@ -939,10 +960,11 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
939960
n.hasNFDLabels = hasNFDLabels
940961

941962
if n.hasGPUNodes {
942-
gpuNodeOSTag, err := n.getGPUNodeOSTag()
963+
gpuNodeOSRelease, gpuNodeOSTag, err := n.getGPUNodeOSInfo()
943964
if err != nil {
944965
return fmt.Errorf("failed to retrieve GPU node OS tag: %w", err)
945966
}
967+
n.gpuNodeOSRelease = gpuNodeOSRelease
946968
n.gpuNodeOSTag = gpuNodeOSTag
947969
}
948970
// fetch all nodes and annotate gpu nodes

controllers/state_manager_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,104 @@
1717
package controllers
1818

1919
import (
20+
"context"
2021
"errors"
2122
"testing"
2223

2324
"github.com/stretchr/testify/require"
2425
corev1 "k8s.io/api/core/v1"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/runtime"
2528
"k8s.io/utils/ptr"
29+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
2630

2731
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
2832
)
2933

34+
func TestGetGPUNodeOSInfo(t *testing.T) {
35+
testCases := []struct {
36+
name string
37+
osName string
38+
osVersion string
39+
expected string
40+
}{
41+
{
42+
name: "talos version with v prefix",
43+
osName: "talos",
44+
osVersion: "v1.12.6",
45+
expected: "talosv1.12.6",
46+
},
47+
{
48+
name: "rhel 10 omits minor version",
49+
osName: "rhel",
50+
osVersion: "10.2",
51+
expected: "rhel10",
52+
},
53+
{
54+
name: "rocky omits minor version",
55+
osName: "rocky",
56+
osVersion: "9.5",
57+
expected: "rocky9",
58+
},
59+
{
60+
name: "ubuntu preserves full version",
61+
osName: "ubuntu",
62+
osVersion: "24.04",
63+
expected: "ubuntu24.04",
64+
},
65+
{
66+
name: "sles preserves dotted version",
67+
osName: "sles",
68+
osVersion: "15.6",
69+
expected: "sles15.6",
70+
},
71+
{
72+
name: "sles preserves service-pack version",
73+
osName: "sles",
74+
osVersion: "15-SP6",
75+
expected: "sles15-SP6",
76+
},
77+
{
78+
name: "sl-micro preserves dotted version",
79+
osName: "sl-micro",
80+
osVersion: "6.0",
81+
expected: "sl-micro6.0",
82+
},
83+
{
84+
name: "archlinux preserves rolling version",
85+
osName: "archlinux",
86+
osVersion: "rolling",
87+
expected: "archlinuxrolling",
88+
},
89+
}
90+
91+
for _, tc := range testCases {
92+
t.Run(tc.name, func(t *testing.T) {
93+
scheme := runtime.NewScheme()
94+
require.NoError(t, corev1.AddToScheme(scheme))
95+
96+
node := &corev1.Node{
97+
ObjectMeta: metav1.ObjectMeta{
98+
Name: "gpu-node-1",
99+
Labels: map[string]string{
100+
commonGPULabelKey: commonGPULabelValue,
101+
nfdOSReleaseIDLabelKey: tc.osName,
102+
nfdOSVersionIDLabelKey: tc.osVersion,
103+
},
104+
},
105+
}
106+
107+
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(node).Build()
108+
controller := ClusterPolicyController{ctx: context.Background(), client: client}
109+
110+
osName, osTag, err := controller.getGPUNodeOSInfo()
111+
require.NoError(t, err)
112+
require.Equal(t, tc.osName, osName)
113+
require.Equal(t, tc.expected, osTag)
114+
})
115+
}
116+
}
117+
30118
func TestGetRuntimeString(t *testing.T) {
31119
testCases := []struct {
32120
description string

controllers/transforms_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3016,7 +3016,7 @@ func TestTransformDriver(t *testing.T) {
30163016
t.Run(tc.description, func(t *testing.T) {
30173017
err := TransformDriver(tc.ds.DaemonSet, tc.cpSpec,
30183018
ClusterPolicyController{client: tc.client, runtime: gpuv1.Containerd,
3019-
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSTag: "ubuntu20.04"})
3019+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSRelease: "ubuntu", gpuNodeOSTag: "ubuntu20.04"})
30203020
if tc.errorExpected {
30213021
require.Error(t, err)
30223022
return
@@ -3430,7 +3430,7 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) {
34303430
t.Run(tc.description, func(t *testing.T) {
34313431
err := TransformDriver(tc.ds.DaemonSet, tc.cpSpec,
34323432
ClusterPolicyController{client: tc.client, runtime: gpuv1.Containerd,
3433-
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSTag: "ubuntu20.04"})
3433+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSRelease: "ubuntu", gpuNodeOSTag: "ubuntu20.04"})
34343434
if tc.errorExpected {
34353435
require.Error(t, err)
34363436
return
@@ -3562,7 +3562,7 @@ func TestTransformDriverWithResources(t *testing.T) {
35623562
t.Run(tc.description, func(t *testing.T) {
35633563
err := TransformDriver(tc.ds.DaemonSet, tc.cpSpec,
35643564
ClusterPolicyController{client: tc.client, runtime: gpuv1.Containerd,
3565-
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSTag: "ubuntu20.04"})
3565+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSRelease: "ubuntu", gpuNodeOSTag: "ubuntu20.04"})
35663566
if tc.errorExpected {
35673567
require.Error(t, err)
35683568
return
@@ -3657,7 +3657,7 @@ func TestTransformDriverRDMA(t *testing.T) {
36573657

36583658
err := TransformDriver(ds.DaemonSet, cpSpec,
36593659
ClusterPolicyController{client: mockClient, runtime: gpuv1.Containerd,
3660-
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSTag: "ubuntu20.04"})
3660+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSRelease: "ubuntu", gpuNodeOSTag: "ubuntu20.04"})
36613661
require.NoError(t, err)
36623662

36633663
require.EqualValues(t, expectedDs, ds)
@@ -3740,7 +3740,7 @@ func TestTransformDriverVGPUTopologyConfig(t *testing.T) {
37403740

37413741
err := TransformDriver(ds.DaemonSet, cpSpec,
37423742
ClusterPolicyController{client: mockClient, runtime: gpuv1.Containerd,
3743-
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSTag: "ubuntu20.04"})
3743+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSRelease: "ubuntu", gpuNodeOSTag: "ubuntu20.04"})
37443744
require.NoError(t, err)
37453745
require.EqualValues(t, expectedDs, ds)
37463746
}
@@ -4173,7 +4173,7 @@ func TestTransformDriverWithAdditionalConfig(t *testing.T) {
41734173
t.Run(tc.description, func(t *testing.T) {
41744174
err := TransformDriver(tc.ds.DaemonSet, tc.cpSpec,
41754175
ClusterPolicyController{client: tc.client, runtime: gpuv1.Containerd,
4176-
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSTag: "ubuntu24.04"})
4176+
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test"), gpuNodeOSRelease: "ubuntu", gpuNodeOSTag: "ubuntu24.04"})
41774177
if tc.errorExpected {
41784178
require.Error(t, err)
41794179
require.Equal(t, tc.errorMessage, err.Error())

internal/state/nodepool.go

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -142,17 +142,40 @@ func getNodePools(ctx context.Context, k8sClient client.Client, selector map[str
142142

143143
func getOSTag(osRelease, osVersion string) (string, error) {
144144
osMajorVersion := strings.Split(osVersion, ".")[0]
145-
osMajorNumber, err := strconv.Atoi(osMajorVersion)
146-
if err != nil {
147-
return "", fmt.Errorf("failed to parse os version: %w", err)
148-
}
149145

150146
var osTagSuffix string
151147
// If the OS is RockyLinux or RHEL 10 & above, we will omit the minor version when constructing the os image tag
152-
if osRelease == "rocky" || (osRelease == "rhel" && osMajorNumber >= 10) {
148+
switch osRelease {
149+
case "rocky":
153150
osTagSuffix = osMajorVersion
154-
} else {
151+
case "rhel":
152+
osMajorNumber, err := parseOSMajorVersion(osVersion)
153+
if err != nil {
154+
return "", fmt.Errorf("failed to parse os version: %w", err)
155+
}
156+
if osMajorNumber >= 10 {
157+
osTagSuffix = osMajorVersion
158+
} else {
159+
osTagSuffix = osVersion
160+
}
161+
default:
155162
osTagSuffix = osVersion
156163
}
157164
return fmt.Sprintf("%s%s", osRelease, osTagSuffix), nil
158165
}
166+
167+
func parseOSMajorVersion(osVersion string) (int, error) {
168+
osMajorVersion := strings.Split(osVersion, ".")[0]
169+
osMajorVersion = strings.TrimSpace(osMajorVersion)
170+
osMajorVersion = strings.TrimPrefix(strings.TrimPrefix(osMajorVersion, "v"), "V")
171+
if osMajorVersion == "" {
172+
return 0, fmt.Errorf("empty OS major version")
173+
}
174+
175+
osMajorNumber, err := strconv.Atoi(osMajorVersion)
176+
if err != nil {
177+
return 0, err
178+
}
179+
180+
return osMajorNumber, nil
181+
}

0 commit comments

Comments
 (0)