diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index cb8ce2ef8..f18a3021b 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -499,6 +499,15 @@ type DriverSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:auto,urn:alm:descriptor:com.tectonic.ui:select:open,urn:alm:descriptor:com.tectonic.ui:select:proprietary" KernelModuleType string `json:"kernelModuleType,omitempty"` + // DriverType defines the type of NVIDIA driver to be deployed. + // Accepted values are gpu, vgpu, and vgpu-host-manager. + // +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Driver Type" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.description="Driver Type" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:gpu,urn:alm:descriptor:com.tectonic.ui:select:vgpu,urn:alm:descriptor:com.tectonic.ui:select:vgpu-host-manager" + DriverType string `json:"driverType,omitempty"` + // Enabled indicates if deployment of NVIDIA Driver through operator is enabled // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator" diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 0510c140f..07395d2cf 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -643,6 +643,15 @@ spec: name: type: string type: object + driverType: + description: |- + DriverType defines the type of NVIDIA driver to be deployed. 
+ Accepted values are gpu, vgpu, and vgpu-host-manager. + enum: + - gpu + - vgpu + - vgpu-host-manager + type: string enabled: description: Enabled indicates if deployment of NVIDIA Driver through operator is enabled diff --git a/cmd/nvidia-validator/main.go b/cmd/nvidia-validator/main.go index ee84da29b..fb9a43553 100644 --- a/cmd/nvidia-validator/main.go +++ b/cmd/nvidia-validator/main.go @@ -843,6 +843,20 @@ func (d *Driver) createStatusFile(driverInfo driverInfo) error { return createStatusFileWithContent(outputDirFlag+"/"+driverStatusFile, statusFileContent) } +// areNvidiaModulesLoaded reports whether the NVIDIA kernel module is already loaded in kernel memory. +func areNvidiaModulesLoaded() bool { + // The module is treated as loaded when /sys/module/nvidia/refcnt is readable; reading it directly (rather than stat followed by read) avoids a check-then-use race + refcntData, err := os.ReadFile("/sys/module/nvidia/refcnt") + if err != nil { + if !os.IsNotExist(err) { + log.Warningf("unable to read /sys/module/nvidia/refcnt: %v", err) + } + return false + } + log.Infof("NVIDIA kernel modules already loaded in kernel memory (refcnt=%s)", strings.TrimSpace(string(refcntData))) + return true +} + // createDevCharSymlinks creates symlinks in /host-dev-char that point to all possible NVIDIA devices nodes. func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation bool) error { if disableDevCharSymlinkCreation { @@ -853,8 +867,16 @@ func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation log.Info("creating symlinks under /dev/char that correspond to NVIDIA character devices") - // Only attempt to load NVIDIA kernel modules when we can chroot into driverRoot - loadKernelModules := driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot) + // Check if NVIDIA modules are already loaded in kernel memory. + // If they are, we don't need to run modprobe (which would fail if modules aren't in /lib/modules/). 
+ // This handles the case where the driver container performed a userspace-only install + // after detecting that modules were already loaded from a previous boot. + modulesAlreadyLoaded := areNvidiaModulesLoaded() + + // Only attempt to load NVIDIA kernel modules when: + // 1. Modules are not already loaded in kernel memory, AND + // 2. We can chroot into driverRoot to run modprobe + loadKernelModules := !modulesAlreadyLoaded && (driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot)) // driverRootCtrPath is the path of the driver install dir in the container. This will either be // driverInstallDirCtrPathFlag or '/host'. diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 0510c140f..07395d2cf 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -643,6 +643,15 @@ spec: name: type: string type: object + driverType: + description: |- + DriverType defines the type of NVIDIA driver to be deployed. + Accepted values are gpu, vgpu, and vgpu-host-manager. 
+ enum: + - gpu + - vgpu + - vgpu-host-manager + type: string enabled: description: Enabled indicates if deployment of NVIDIA Driver through operator is enabled diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0b08fbc6d..c270acafe 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -160,6 +160,10 @@ const ( OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED" // KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE" + // DriverTypeEnvName is the name of the driver-container envvar to set the driver type + DriverTypeEnvName = "DRIVER_TYPE" + // DriverVersionEnvName is the name of the envvar to set the desired driver version + DriverVersionEnvName = "DRIVER_VERSION" // MPSRootEnvName is the name of the envvar for configuring the MPS root MPSRootEnvName = "MPS_ROOT" // DefaultMPSRoot is the default MPS root path on the host @@ -1002,7 +1006,7 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C } // update driver-manager initContainer - err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA) + err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA, config.Driver.Version, config.Driver.KernelModuleType, config.Driver.DriverType) if err != nil { return err } @@ -1050,7 +1054,7 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C // TransformVGPUManager transforms NVIDIA vGPU Manager daemonset with required config as per ClusterPolicy func TransformVGPUManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update k8s-driver-manager initContainer - err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil) + err := transformDriverManagerInitContainer(obj, 
&config.VGPUManager.DriverManager, nil, config.VGPUManager.Version, "", "") if err != nil { return fmt.Errorf("failed to transform k8s-driver-manager initContainer for vGPU Manager: %v", err) } @@ -2009,7 +2013,7 @@ func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec // TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update k8s-driver-manager initContainer - err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil) + err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil, config.VFIOManager.Version, "", "") if err != nil { return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err) } @@ -2741,7 +2745,7 @@ func transformConfigManagerSidecarContainer(obj *appsv1.DaemonSet, config *gpuv1 return nil } -func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec) error { +func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec, driverVersion string, kernelModuleType string, driverType string) error { container := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager") if container == nil { @@ -2765,6 +2769,21 @@ func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpe } } + // set driver version for config change detection + if driverVersion != "" { + setContainerEnv(container, DriverVersionEnvName, driverVersion) + } + + // set kernel module type for config change detection + if kernelModuleType != "" { + setContainerEnv(container, KernelModuleTypeEnvName, kernelModuleType) + } + + // set driver type for config change detection + if driverType != "" { + 
setContainerEnv(container, DriverTypeEnvName, driverType) + } + // set/append environment variables for driver-manager initContainer if len(driverManagerSpec.Env) > 0 { for _, env := range driverManagerSpec.Env { @@ -3424,6 +3443,10 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy } } + if len(config.Driver.DriverType) > 0 { + setContainerEnv(driverContainer, DriverTypeEnvName, config.Driver.DriverType) + } + // set container probe timeouts if config.Driver.StartupProbe != nil { setContainerProbe(driverContainer, config.Driver.StartupProbe, Startup) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 2e0ae7c1d..a56849f4d 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1719,7 +1719,9 @@ func TestTransformVFIOManager(t *testing.T) { Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, - Env: mockEnvCore, + Env: append([]corev1.EnvVar{ + {Name: DriverVersionEnvName, Value: "v1.0.0"}, + }, mockEnvCore...), }). 
WithPullSecret(secret), }, @@ -1937,10 +1939,12 @@ func newBoolPtr(b bool) *bool { func TestTransformDriverManagerInitContainer(t *testing.T) { testCases := []struct { - description string - ds Daemonset - cpSpec *gpuv1.ClusterPolicySpec - expectedDs Daemonset + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + driverVersion string + kernelModuleType string + expectedDs Daemonset }{ { description: "transform k8s-driver-manager initContainer", @@ -1963,6 +1967,8 @@ func TestTransformDriverManagerInitContainer(t *testing.T) { }, }, }, + driverVersion: "", + kernelModuleType: "", expectedDs: NewDaemonset().WithInitContainer(corev1.Container{ Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0", @@ -1974,11 +1980,39 @@ func TestTransformDriverManagerInitContainer(t *testing.T) { }, }).WithInitContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret"), }, + { + description: "transform k8s-driver-manager initContainer with driver version and kernel module type", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}). 
+ WithInitContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Manager: gpuv1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + }, + }, + }, + driverVersion: "550.90.12", + kernelModuleType: "open", + expectedDs: NewDaemonset().WithInitContainer(corev1.Container{ + Name: "k8s-driver-manager", + Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: DriverVersionEnvName, Value: "550.90.12"}, + {Name: KernelModuleTypeEnvName, Value: "open"}, + }, + }).WithInitContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret"), + }, } for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - err := transformDriverManagerInitContainer(tc.ds.DaemonSet, &tc.cpSpec.Driver.Manager, tc.cpSpec.Driver.GPUDirectRDMA) + err := transformDriverManagerInitContainer(tc.ds.DaemonSet, &tc.cpSpec.Driver.Manager, tc.cpSpec.Driver.GPUDirectRDMA, tc.driverVersion, tc.kernelModuleType, tc.cpSpec.Driver.DriverType) require.NoError(t, err) require.EqualValues(t, tc.expectedDs, tc.ds) }) @@ -2665,6 +2699,9 @@ func TestTransformDriver(t *testing.T) { }).WithInitContainer(corev1.Container{ Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + Env: []corev1.EnvVar{ + {Name: DriverVersionEnvName, Value: "570.172.08"}, + }, }), errorExpected: false, }, @@ -2962,6 +2999,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) { Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: DriverVersionEnvName, Value: "570.172.08"}, + }, }).WithVolume(corev1.Volume{ Name: "licensing-config", VolumeSource: corev1.VolumeSource{ @@ -3016,6 
+3056,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) { Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", ImagePullPolicy: corev1.PullIfNotPresent, + Env: []corev1.EnvVar{ + {Name: DriverVersionEnvName, Value: "570.172.08"}, + }, }).WithVolume(corev1.Volume{ Name: "licensing-config", VolumeSource: corev1.VolumeSource{ @@ -3140,6 +3183,9 @@ func TestTransformDriverWithResources(t *testing.T) { }).WithInitContainer(corev1.Container{ Name: "k8s-driver-manager", Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + Env: []corev1.EnvVar{ + {Name: DriverVersionEnvName, Value: "570.172.08"}, + }, }), errorExpected: false, }, @@ -3219,6 +3265,10 @@ func TestTransformDriverRDMA(t *testing.T) { Name: "USE_HOST_MOFED", Value: "true", }, + { + Name: "DRIVER_VERSION", + Value: "570.172.08", + }, }, }).WithContainer(corev1.Container{ Name: "nvidia-peermem", diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 0510c140f..07395d2cf 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -643,6 +643,15 @@ spec: name: type: string type: object + driverType: + description: |- + DriverType defines the type of NVIDIA driver to be deployed. + Accepted values are gpu, vgpu, and vgpu-host-manager. 
+ enum: + - gpu + - vgpu + - vgpu-host-manager + type: string enabled: description: Enabled indicates if deployment of NVIDIA Driver through operator is enabled diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 9b3938dc1..189899000 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -627,6 +627,7 @@ func getMinimalDriverRenderData() *driverRenderData { LivenessProbe: getDefaultContainerProbeSpec(), ReadinessProbe: getDefaultContainerProbeSpec(), DriverType: nvidiav1alpha1.GPU, + Version: "525.85.03", Resources: &nvidiav1alpha1.ResourceRequirements{ Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("200m"), diff --git a/internal/state/testdata/golden/driver-additional-configs.yaml b/internal/state/testdata/golden/driver-additional-configs.yaml index 9ac2e3666..d70029922 100644 --- a/internal/state/testdata/golden/driver-additional-configs.yaml +++ b/internal/state/testdata/golden/driver-additional-configs.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -239,6 +241,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml index 6eb6bed8d..c512c8817 100644 --- a/internal/state/testdata/golden/driver-full-spec.yaml +++ b/internal/state/testdata/golden/driver-full-spec.yaml @@ -248,6 +248,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: KERNEL_MODULE_TYPE + value: open - name: FOO value: foo - name: BAR diff --git a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml 
b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml index 81bcfa59a..24e05f720 100644 --- a/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy-openshift.yaml @@ -198,6 +198,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu - name: OPENSHIFT_VERSION value: "4.13" - name: HTTP_PROXY @@ -408,6 +410,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-gdrcopy.yaml b/internal/state/testdata/golden/driver-gdrcopy.yaml index b9b8971fb..824880b94 100644 --- a/internal/state/testdata/golden/driver-gdrcopy.yaml +++ b/internal/state/testdata/golden/driver-gdrcopy.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -291,6 +293,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-gds.yaml b/internal/state/testdata/golden/driver-gds.yaml index 43b930c2a..c8aabd73e 100644 --- a/internal/state/testdata/golden/driver-gds.yaml +++ b/internal/state/testdata/golden/driver-gds.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -291,6 +293,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + 
value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-minimal.yaml b/internal/state/testdata/golden/driver-minimal.yaml index f2b314dca..35670397d 100644 --- a/internal/state/testdata/golden/driver-minimal.yaml +++ b/internal/state/testdata/golden/driver-minimal.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -230,6 +232,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml index 04b00b7d2..527ad3b8e 100644 --- a/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml +++ b/internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml @@ -198,6 +198,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu - name: OPENSHIFT_VERSION value: "4.13" - name: HTTP_PROXY @@ -346,6 +348,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-precompiled.yaml b/internal/state/testdata/golden/driver-precompiled.yaml index 40c6bcf9a..f76dd161d 100644 --- a/internal/state/testdata/golden/driver-precompiled.yaml +++ b/internal/state/testdata/golden/driver-precompiled.yaml @@ -144,6 +144,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: 
DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:535-5.4.0-150-generic-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -232,6 +234,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml index 917bafec6..36c06da34 100644 --- a/internal/state/testdata/golden/driver-rdma-hostmofed.yaml +++ b/internal/state/testdata/golden/driver-rdma-hostmofed.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu - name: GPU_DIRECT_RDMA_ENABLED value: "true" - name: USE_HOST_MOFED @@ -309,6 +311,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu - name: GPU_DIRECT_RDMA_ENABLED value: "true" - name: USE_HOST_MOFED diff --git a/internal/state/testdata/golden/driver-rdma.yaml b/internal/state/testdata/golden/driver-rdma.yaml index 70e2b5b26..92a804d7f 100644 --- a/internal/state/testdata/golden/driver-rdma.yaml +++ b/internal/state/testdata/golden/driver-rdma.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu - name: GPU_DIRECT_RDMA_ENABLED value: "true" image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 @@ -305,6 +307,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu - name: GPU_DIRECT_RDMA_ENABLED value: "true" image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel diff --git a/internal/state/testdata/golden/driver-secret-env.yaml b/internal/state/testdata/golden/driver-secret-env.yaml index 17fd86194..f510795f1 100644 --- 
a/internal/state/testdata/golden/driver-secret-env.yaml +++ b/internal/state/testdata/golden/driver-secret-env.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu envFrom: - secretRef: name: test-secret-env @@ -321,6 +323,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml index 77f96808c..b7f998eec 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager-openshift.yaml @@ -188,6 +188,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: vgpu-host-manager - name: OPENSHIFT_VERSION value: "4.13" image: nvcr.io/nvidia/vgpu-manager:525.85.03-rhel8.0 @@ -307,6 +309,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: vgpu-host-manager image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml index 8da9c438a..638eb0232 100644 --- a/internal/state/testdata/golden/driver-vgpu-host-manager.yaml +++ b/internal/state/testdata/golden/driver-vgpu-host-manager.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: vgpu-host-manager image: nvcr.io/nvidia/vgpu-manager:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent name: nvidia-driver-ctr @@ -216,6 +218,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace 
+ - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: vgpu-host-manager image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml index 19a8726a8..efbbe7011 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing-secret.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -236,6 +238,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/internal/state/testdata/golden/driver-vgpu-licensing.yaml b/internal/state/testdata/golden/driver-vgpu-licensing.yaml index 0286d488d..86ee8dcac 100644 --- a/internal/state/testdata/golden/driver-vgpu-licensing.yaml +++ b/internal/state/testdata/golden/driver-vgpu-licensing.yaml @@ -142,6 +142,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04 imagePullPolicy: IfNotPresent lifecycle: @@ -236,6 +238,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: DRIVER_VERSION + value: 525.85.03 + - name: DRIVER_TYPE + value: gpu image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel imagePullPolicy: IfNotPresent name: k8s-driver-manager diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index e3d0dc1f6..06ebdcb21 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ 
-148,6 +148,18 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + {{- if .Driver.Spec.Version }} + - name: DRIVER_VERSION + value: {{ .Driver.Spec.Version | quote }} + {{- end }} + {{- if .Driver.Spec.KernelModuleType }} + - name: KERNEL_MODULE_TYPE + value: {{ .Driver.Spec.KernelModuleType | quote }} + {{- end }} + {{- if .Driver.Spec.DriverType }} + - name: DRIVER_TYPE + value: {{ .Driver.Spec.DriverType | quote }} + {{- end }} {{- if and (.GPUDirectRDMA) (deref .GPUDirectRDMA.Enabled) }} - name: GPU_DIRECT_RDMA_ENABLED value: "true" @@ -226,6 +238,10 @@ spec: value: "true" {{- end }} {{- end }} + {{- if .Driver.Spec.DriverType }} + - name: DRIVER_TYPE + value: {{ .Driver.Spec.DriverType | quote }} + {{- end }} {{- if and (.Openshift) (.Runtime.OpenshiftVersion) }} - name: OPENSHIFT_VERSION value: {{ .Runtime.OpenshiftVersion | quote }}