Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,15 @@ type DriverSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:auto,urn:alm:descriptor:com.tectonic.ui:select:open,urn:alm:descriptor:com.tectonic.ui:select:proprietary"
KernelModuleType string `json:"kernelModuleType,omitempty"`

// DriverType defines the type of NVIDIA driver to be deployed.
// Accepted values are gpu, vgpu, and vgpu-host-manager.
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Driver Type"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.description="Driver Type"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:gpu,urn:alm:descriptor:com.tectonic.ui:select:vgpu,urn:alm:descriptor:com.tectonic.ui:select:vgpu-host-manager"
DriverType string `json:"driverType,omitempty"`

// Enabled indicates if deployment of NVIDIA Driver through operator is enabled
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator"
Expand Down
9 changes: 9 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,15 @@ spec:
name:
type: string
type: object
driverType:
description: |-
DriverType defines the type of NVIDIA driver to be deployed.
Accepted values are gpu, vgpu, and vgpu-host-manager.
enum:
- gpu
- vgpu
- vgpu-host-manager
type: string
enabled:
description: Enabled indicates if deployment of NVIDIA Driver
through operator is enabled
Expand Down
26 changes: 24 additions & 2 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,20 @@ func (d *Driver) createStatusFile(driverInfo driverInfo) error {
return createStatusFileWithContent(outputDirFlag+"/"+driverStatusFile, statusFileContent)
}

// areNvidiaModulesLoaded reports whether the NVIDIA kernel module is already
// loaded in kernel memory.
//
// The signal used is whether /sys/module/nvidia/refcnt can be read. It returns
// true (and logs the reference count found in the file) when the read
// succeeds, and false otherwise.
//
// NOTE(review): only the "nvidia" module is probed; companion modules such as
// nvidia-uvm or nvidia-modeset are not checked here — confirm that is intended.
func areNvidiaModulesLoaded() bool {
	const refcntPath = "/sys/module/nvidia/refcnt"

	// Read the file directly rather than calling os.Stat first: a single read
	// answers both "does it exist" and "what does it contain", and avoids the
	// window between the existence check and the read.
	refcntData, err := os.ReadFile(refcntPath)
	if err != nil {
		// A missing file is the normal "module not loaded" case and stays
		// quiet. Any other failure is unexpected, so surface it instead of
		// silently treating it the same way.
		if !os.IsNotExist(err) {
			log.Infof("unable to read %s, assuming NVIDIA kernel modules are not loaded: %v", refcntPath, err)
		}
		return false
	}

	refcnt := strings.TrimSpace(string(refcntData))
	log.Infof("NVIDIA kernel modules already loaded in kernel memory (refcnt=%s)", refcnt)
	return true
}

// createDevCharSymlinks creates symlinks in /host-dev-char that point to all possible NVIDIA device nodes.
func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation bool) error {
if disableDevCharSymlinkCreation {
Expand All @@ -853,8 +867,16 @@ func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation

log.Info("creating symlinks under /dev/char that correspond to NVIDIA character devices")

// Only attempt to load NVIDIA kernel modules when we can chroot into driverRoot
loadKernelModules := driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot)
// Check if NVIDIA modules are already loaded in kernel memory.
// If they are, we don't need to run modprobe (which would fail if modules aren't in /lib/modules/).
// This handles the case where the driver container performed a userspace-only install
// after detecting that modules were already loaded from a previous boot.
modulesAlreadyLoaded := areNvidiaModulesLoaded()

// Only attempt to load NVIDIA kernel modules when:
// 1. Modules are not already loaded in kernel memory, AND
// 2. We can chroot into driverRoot to run modprobe
loadKernelModules := !modulesAlreadyLoaded && (driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot))

// driverRootCtrPath is the path of the driver install dir in the container. This will either be
// driverInstallDirCtrPathFlag or '/host'.
Expand Down
9 changes: 9 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,15 @@ spec:
name:
type: string
type: object
driverType:
description: |-
DriverType defines the type of NVIDIA driver to be deployed.
Accepted values are gpu, vgpu, and vgpu-host-manager.
enum:
- gpu
- vgpu
- vgpu-host-manager
type: string
enabled:
description: Enabled indicates if deployment of NVIDIA Driver
through operator is enabled
Expand Down
31 changes: 27 additions & 4 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,10 @@ const (
OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED"
// KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type
KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE"
// DriverTypeEnvName is the name of the driver-container envvar to set the driver type
DriverTypeEnvName = "DRIVER_TYPE"
// DriverVersionEnvName is the name of the envvar to set the desired driver version
DriverVersionEnvName = "DRIVER_VERSION"
// MPSRootEnvName is the name of the envvar for configuring the MPS root
MPSRootEnvName = "MPS_ROOT"
// DefaultMPSRoot is the default MPS root path on the host
Expand Down Expand Up @@ -1002,7 +1006,7 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
}

// update driver-manager initContainer
err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA)
err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA, config.Driver.Version, config.Driver.KernelModuleType, config.Driver.DriverType)
if err != nil {
return err
}
Expand Down Expand Up @@ -1050,7 +1054,7 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
// TransformVGPUManager transforms NVIDIA vGPU Manager daemonset with required config as per ClusterPolicy
func TransformVGPUManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
// update k8s-driver-manager initContainer
err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil)
err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil, config.VGPUManager.Version, "", "")
if err != nil {
return fmt.Errorf("failed to transform k8s-driver-manager initContainer for vGPU Manager: %v", err)
}
Expand Down Expand Up @@ -2009,7 +2013,7 @@ func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec
// TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy
func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
// update k8s-driver-manager initContainer
err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil)
err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil, config.VFIOManager.Version, "", "")
if err != nil {
return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err)
}
Expand Down Expand Up @@ -2741,7 +2745,7 @@ func transformConfigManagerSidecarContainer(obj *appsv1.DaemonSet, config *gpuv1
return nil
}

func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec) error {
func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec, driverVersion string, kernelModuleType string, driverType string) error {
container := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")

if container == nil {
Expand All @@ -2765,6 +2769,21 @@ func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpe
}
}

// set driver version for config change detection
if driverVersion != "" {
setContainerEnv(container, DriverVersionEnvName, driverVersion)
}

// set kernel module type for config change detection
if kernelModuleType != "" {
setContainerEnv(container, KernelModuleTypeEnvName, kernelModuleType)
}

// set driver type for config change detection
if driverType != "" {
setContainerEnv(container, DriverTypeEnvName, driverType)
}

// set/append environment variables for driver-manager initContainer
if len(driverManagerSpec.Env) > 0 {
for _, env := range driverManagerSpec.Env {
Expand Down Expand Up @@ -3424,6 +3443,10 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
}
}

if len(config.Driver.DriverType) > 0 {
setContainerEnv(driverContainer, DriverTypeEnvName, config.Driver.DriverType)
}

// set container probe timeouts
if config.Driver.StartupProbe != nil {
setContainerProbe(driverContainer, config.Driver.StartupProbe, Startup)
Expand Down
62 changes: 56 additions & 6 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1719,7 +1719,9 @@ func TestTransformVFIOManager(t *testing.T) {
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: mockEnvCore,
Env: append([]corev1.EnvVar{
{Name: DriverVersionEnvName, Value: "v1.0.0"},
}, mockEnvCore...),
}).
WithPullSecret(secret),
},
Expand Down Expand Up @@ -1937,10 +1939,12 @@ func newBoolPtr(b bool) *bool {

func TestTransformDriverManagerInitContainer(t *testing.T) {
testCases := []struct {
description string
ds Daemonset
cpSpec *gpuv1.ClusterPolicySpec
expectedDs Daemonset
description string
ds Daemonset
cpSpec *gpuv1.ClusterPolicySpec
driverVersion string
kernelModuleType string
expectedDs Daemonset
}{
{
description: "transform k8s-driver-manager initContainer",
Expand All @@ -1963,6 +1967,8 @@ func TestTransformDriverManagerInitContainer(t *testing.T) {
},
},
},
driverVersion: "",
kernelModuleType: "",
expectedDs: NewDaemonset().WithInitContainer(corev1.Container{
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0",
Expand All @@ -1974,11 +1980,39 @@ func TestTransformDriverManagerInitContainer(t *testing.T) {
},
}).WithInitContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret"),
},
{
description: "transform k8s-driver-manager initContainer with driver version and kernel module type",
ds: NewDaemonset().
WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}).
WithInitContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
Driver: gpuv1.DriverSpec{
Manager: gpuv1.DriverManagerSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "k8s-driver-manager",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
},
},
},
driverVersion: "550.90.12",
kernelModuleType: "open",
expectedDs: NewDaemonset().WithInitContainer(corev1.Container{
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: DriverVersionEnvName, Value: "550.90.12"},
{Name: KernelModuleTypeEnvName, Value: "open"},
},
}).WithInitContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret"),
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
err := transformDriverManagerInitContainer(tc.ds.DaemonSet, &tc.cpSpec.Driver.Manager, tc.cpSpec.Driver.GPUDirectRDMA)
err := transformDriverManagerInitContainer(tc.ds.DaemonSet, &tc.cpSpec.Driver.Manager, tc.cpSpec.Driver.GPUDirectRDMA, tc.driverVersion, tc.kernelModuleType, tc.cpSpec.Driver.DriverType)
require.NoError(t, err)
require.EqualValues(t, tc.expectedDs, tc.ds)
})
Expand Down Expand Up @@ -2665,6 +2699,9 @@ func TestTransformDriver(t *testing.T) {
}).WithInitContainer(corev1.Container{
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
Env: []corev1.EnvVar{
{Name: DriverVersionEnvName, Value: "570.172.08"},
},
}),
errorExpected: false,
},
Expand Down Expand Up @@ -2962,6 +2999,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) {
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: DriverVersionEnvName, Value: "570.172.08"},
},
}).WithVolume(corev1.Volume{
Name: "licensing-config",
VolumeSource: corev1.VolumeSource{
Expand Down Expand Up @@ -3016,6 +3056,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) {
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: DriverVersionEnvName, Value: "570.172.08"},
},
}).WithVolume(corev1.Volume{
Name: "licensing-config",
VolumeSource: corev1.VolumeSource{
Expand Down Expand Up @@ -3140,6 +3183,9 @@ func TestTransformDriverWithResources(t *testing.T) {
}).WithInitContainer(corev1.Container{
Name: "k8s-driver-manager",
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
Env: []corev1.EnvVar{
{Name: DriverVersionEnvName, Value: "570.172.08"},
},
}),
errorExpected: false,
},
Expand Down Expand Up @@ -3219,6 +3265,10 @@ func TestTransformDriverRDMA(t *testing.T) {
Name: "USE_HOST_MOFED",
Value: "true",
},
{
Name: "DRIVER_VERSION",
Value: "570.172.08",
},
},
}).WithContainer(corev1.Container{
Name: "nvidia-peermem",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,15 @@ spec:
name:
type: string
type: object
driverType:
description: |-
DriverType defines the type of NVIDIA driver to be deployed.
Accepted values are gpu, vgpu, and vgpu-host-manager.
enum:
- gpu
- vgpu
- vgpu-host-manager
type: string
enabled:
description: Enabled indicates if deployment of NVIDIA Driver
through operator is enabled
Expand Down
1 change: 1 addition & 0 deletions internal/state/driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ func getMinimalDriverRenderData() *driverRenderData {
LivenessProbe: getDefaultContainerProbeSpec(),
ReadinessProbe: getDefaultContainerProbeSpec(),
DriverType: nvidiav1alpha1.GPU,
Version: "525.85.03",
Resources: &nvidiav1alpha1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("200m"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_TYPE
value: gpu
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
imagePullPolicy: IfNotPresent
lifecycle:
Expand Down Expand Up @@ -239,6 +241,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_VERSION
value: 525.85.03
- name: DRIVER_TYPE
value: gpu
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-full-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: KERNEL_MODULE_TYPE
value: open
- name: FOO
value: foo
- name: BAR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_TYPE
value: gpu
- name: OPENSHIFT_VERSION
value: "4.13"
- name: HTTP_PROXY
Expand Down Expand Up @@ -408,6 +410,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_VERSION
value: 525.85.03
- name: DRIVER_TYPE
value: gpu
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
6 changes: 6 additions & 0 deletions internal/state/testdata/golden/driver-gdrcopy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_TYPE
value: gpu
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
imagePullPolicy: IfNotPresent
lifecycle:
Expand Down Expand Up @@ -291,6 +293,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_VERSION
value: 525.85.03
- name: DRIVER_TYPE
value: gpu
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
Loading
Loading