From 0101bff707832c5a296b33a6dcd80061bf3df74a Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Mon, 1 Dec 2025 13:12:27 -0800 Subject: [PATCH 1/2] [state-toolkit] add support for mounting runtime NRI sockets Signed-off-by: Tariq Ibrahim --- controllers/object_controls.go | 24 +++++++++++++++-- controllers/transforms_test.go | 47 +++++++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0b08fbc6d..6ac6a17a2 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -66,6 +66,8 @@ const ( DefaultDockerConfigFile = "/etc/docker/daemon.json" // DefaultDockerSocketFile indicates default docker socket file DefaultDockerSocketFile = "/var/run/docker.sock" + // DefaultRuntimeNRISocketFile indicates the default container runtime NRI socket file + DefaultRuntimeNRISocketFile = "/var/run/nri/nri.sock" // DefaultCRIOConfigFile indicates default config file path for cri-o. . 
DefaultCRIOConfigFile = "/etc/crio/config.toml" // DefaultCRIODropInConfigFile indicates the default path to the drop-in config file for cri-o @@ -82,9 +84,11 @@ const ( DefaultRuntimeClass = "nvidia" // DriverInstallPathVolName represents volume name for driver install path provided to toolkit DriverInstallPathVolName = "driver-install-path" - // DefaultRuntimeSocketTargetDir represents target directory where runtime socket dirctory will be mounted + // DefaultRuntimeNRISocketTargetDir represents target directory where runtime NRI socket directory will be mounted + DefaultRuntimeNRISocketTargetDir = "/runtime/nri-sock-dir/" + // DefaultRuntimeSocketTargetDir represents target directory where runtime socket directory will be mounted DefaultRuntimeSocketTargetDir = "/runtime/sock-dir/" - // DefaultRuntimeConfigTargetDir represents target directory where runtime socket dirctory will be mounted + // DefaultRuntimeConfigTargetDir represents target directory where runtime config directory will be mounted DefaultRuntimeConfigTargetDir = "/runtime/config-dir/" // DefaultRuntimeDropInConfigTargetDir represents target directory where drop-in config directory will be mounted DefaultRuntimeDropInConfigTargetDir = "/runtime/config-dir.d/" @@ -1440,6 +1444,22 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, socketVol := corev1.Volume{Name: volMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeSocketFile)}}} obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol) } + + // setup mounts for the runtime NRI socket file + nriSocketFile := getContainerEnv(container, "RUNTIME_NRI_SOCKET") + if nriSocketFile == "" { + nriSocketFile = DefaultRuntimeNRISocketFile + } + + setContainerEnv(container, "RUNTIME_NRI_SOCKET", DefaultRuntimeNRISocketTargetDir+path.Base(nriSocketFile)) + + nriVolMountSocketName := "nri-socket" + nriVolMountSocket := corev1.VolumeMount{Name: 
nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir} + container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket) + + nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}} + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol) + return nil } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 2e0ae7c1d..d92290300 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -381,6 +381,7 @@ func TestTransformForRuntime(t *testing.T) { WithHostPathVolume("containerd-config", filepath.Dir(DefaultContainerdConfigFile), ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-drop-in-config", "/etc/containerd/conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-socket", filepath.Dir(DefaultContainerdSocketFile), nil). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithContainer(corev1.Container{ Name: "test-ctr", Env: []corev1.EnvVar{ @@ -392,11 +393,13 @@ func TestTransformForRuntime(t *testing.T) { {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/containerd/conf.d/99-nvidia.toml"}, {Name: "RUNTIME_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, {Name: "CONTAINERD_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, + {Name: "RUNTIME_NRI_SOCKET", Value: filepath.Join(DefaultRuntimeNRISocketTargetDir, filepath.Base(DefaultRuntimeNRISocketFile))}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "containerd-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "containerd-drop-in-config", MountPath: "/runtime/config-dir.d/"}, {Name: "containerd-socket", MountPath: DefaultRuntimeSocketTargetDir}, + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, }, }), }, @@ -414,6 +417,7 @@ func TestTransformForRuntime(t *testing.T) { WithHostPathVolume("containerd-config", filepath.Dir(DefaultContainerdConfigFile), ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-drop-in-config", "/etc/containerd/conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-socket", filepath.Dir(DefaultContainerdSocketFile), nil). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithContainer(corev1.Container{ Name: "test-ctr", Env: []corev1.EnvVar{ @@ -426,11 +430,13 @@ func TestTransformForRuntime(t *testing.T) { {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/containerd/conf.d/99-nvidia.toml"}, {Name: "RUNTIME_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, {Name: "CONTAINERD_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, + {Name: "RUNTIME_NRI_SOCKET", Value: filepath.Join(DefaultRuntimeNRISocketTargetDir, filepath.Base(DefaultRuntimeNRISocketFile))}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "containerd-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "containerd-drop-in-config", MountPath: "/runtime/config-dir.d/"}, {Name: "containerd-socket", MountPath: DefaultRuntimeSocketTargetDir}, + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, }, }), }, @@ -448,6 +454,7 @@ func TestTransformForRuntime(t *testing.T) { WithHostPathVolume("containerd-config", filepath.Dir(DefaultContainerdConfigFile), ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-drop-in-config", "/etc/containerd/conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-socket", filepath.Dir(DefaultContainerdSocketFile), nil). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithContainer(corev1.Container{ Name: "test-ctr", Env: []corev1.EnvVar{ @@ -460,11 +467,13 @@ func TestTransformForRuntime(t *testing.T) { {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/containerd/conf.d/99-nvidia.toml"}, {Name: "RUNTIME_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, {Name: "CONTAINERD_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, + {Name: "RUNTIME_NRI_SOCKET", Value: filepath.Join(DefaultRuntimeNRISocketTargetDir, filepath.Base(DefaultRuntimeNRISocketFile))}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "containerd-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "containerd-drop-in-config", MountPath: "/runtime/config-dir.d/"}, {Name: "containerd-socket", MountPath: DefaultRuntimeSocketTargetDir}, + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, }, }), }, @@ -475,6 +484,7 @@ func TestTransformForRuntime(t *testing.T) { expectedOutput: NewDaemonset(). WithHostPathVolume("crio-config", "/etc/crio", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithContainer(corev1.Container{ Name: "test-ctr", Env: []corev1.EnvVar{ @@ -483,10 +493,12 @@ func TestTransformForRuntime(t *testing.T) { {Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"}, {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"}, + {Name: "RUNTIME_NRI_SOCKET", Value: filepath.Join(DefaultRuntimeNRISocketTargetDir, filepath.Base(DefaultRuntimeNRISocketFile))}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"}, + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, }, }), }, @@ -499,6 +511,7 @@ func TestTransformForRuntime(t *testing.T) { expectedOutput: NewDaemonset(). WithHostPathVolume("containerd-config", filepath.Dir(DefaultContainerdConfigFile), ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-socket", filepath.Dir(DefaultContainerdSocketFile), nil). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithContainer(corev1.Container{ Name: "nvidia-kata-manager", Env: []corev1.EnvVar{ @@ -508,10 +521,12 @@ func TestTransformForRuntime(t *testing.T) { {Name: "CONTAINERD_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultContainerdConfigFile))}, {Name: "RUNTIME_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, {Name: "CONTAINERD_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultContainerdSocketFile))}, + {Name: "RUNTIME_NRI_SOCKET", Value: filepath.Join(DefaultRuntimeNRISocketTargetDir, filepath.Base(DefaultRuntimeNRISocketFile))}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "containerd-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "containerd-socket", MountPath: DefaultRuntimeSocketTargetDir}, + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, }, }), }, @@ -522,6 +537,7 @@ func TestTransformForRuntime(t *testing.T) { expectedOutput: NewDaemonset(). WithHostPathVolume("docker-config", filepath.Dir(DefaultDockerConfigFile), ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("docker-socket", filepath.Dir(DefaultDockerSocketFile), nil). + WithHostPathVolume("nri-socket", filepath.Dir(DefaultRuntimeNRISocketFile), ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithContainer(corev1.Container{ Name: "test-ctr", Env: []corev1.EnvVar{ @@ -530,10 +546,12 @@ func TestTransformForRuntime(t *testing.T) { {Name: "DOCKER_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultDockerConfigFile))}, {Name: "RUNTIME_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultDockerSocketFile))}, {Name: "DOCKER_SOCKET", Value: filepath.Join(DefaultRuntimeSocketTargetDir, filepath.Base(DefaultDockerSocketFile))}, + {Name: "RUNTIME_NRI_SOCKET", Value: filepath.Join(DefaultRuntimeNRISocketTargetDir, filepath.Base(DefaultRuntimeNRISocketFile))}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "docker-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "docker-socket", MountPath: DefaultRuntimeSocketTargetDir}, + {Name: "nri-socket", MountPath: DefaultRuntimeNRISocketTargetDir}, }, }), }, @@ -834,16 +852,19 @@ func TestTransformToolkit(t *testing.T) { {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/containerd/conf.d/99-nvidia.toml"}, {Name: "RUNTIME_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, + {Name: "RUNTIME_NRI_SOCKET", Value: "/runtime/nri-sock-dir/nri.sock"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "containerd-config", MountPath: "/runtime/config-dir/"}, {Name: "containerd-drop-in-config", MountPath: "/runtime/config-dir.d/"}, {Name: "containerd-socket", MountPath: "/runtime/sock-dir/"}, + {Name: "nri-socket", MountPath: "/runtime/nri-sock-dir/"}, }, }). WithHostPathVolume("containerd-config", "/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-drop-in-config", "/etc/containerd/conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-socket", "/run/containerd", nil). + WithHostPathVolume("nri-socket", "/var/run/nri", ptr.To(corev1.HostPathDirectoryOrCreate)). 
WithPullSecret("pull-secret"), }, { @@ -913,16 +934,19 @@ func TestTransformToolkit(t *testing.T) { {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.toml"}, {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/containerd/conf.d/99-nvidia.toml"}, {Name: "RUNTIME_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, + {Name: "RUNTIME_NRI_SOCKET", Value: "/runtime/nri-sock-dir/nri.sock"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "containerd-config", MountPath: "/runtime/config-dir/"}, {Name: "containerd-drop-in-config", MountPath: "/runtime/config-dir.d/"}, {Name: "containerd-socket", MountPath: "/runtime/sock-dir/"}, + {Name: "nri-socket", MountPath: "/runtime/nri-sock-dir/"}, }, }). WithHostPathVolume("containerd-config", "/var/lib/rancher/k3s/agent/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-drop-in-config", "/etc/containerd/conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil). + WithHostPathVolume("nri-socket", "/var/run/nri", ptr.To(corev1.HostPathDirectoryOrCreate)). WithPullSecret("pull-secret"), }, { @@ -952,14 +976,17 @@ func TestTransformToolkit(t *testing.T) { {Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"}, {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"}, + {Name: "RUNTIME_NRI_SOCKET", Value: "/runtime/nri-sock-dir/nri.sock"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"}, + {Name: "nri-socket", MountPath: "/runtime/nri-sock-dir/"}, }, }). WithHostPathVolume("crio-config", "/etc/crio", ptr.To(corev1.HostPathDirectoryOrCreate)). 
- WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)), + WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("nri-socket", "/var/run/nri", ptr.To(corev1.HostPathDirectoryOrCreate)), }, { description: "transform nvidia-container-toolkit-ctr container, cri-o runtime, cdi disabled", @@ -988,14 +1015,17 @@ func TestTransformToolkit(t *testing.T) { {Name: "CRIO_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "RUNTIME_DROP_IN_CONFIG", Value: "/runtime/config-dir.d/99-nvidia.conf"}, {Name: "RUNTIME_DROP_IN_CONFIG_HOST_PATH", Value: "/etc/crio/crio.conf.d/99-nvidia.conf"}, + {Name: "RUNTIME_NRI_SOCKET", Value: "/runtime/nri-sock-dir/nri.sock"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "crio-config", MountPath: DefaultRuntimeConfigTargetDir}, {Name: "crio-drop-in-config", MountPath: "/runtime/config-dir.d/"}, + {Name: "nri-socket", MountPath: "/runtime/nri-sock-dir/"}, }, }). WithHostPathVolume("crio-config", "/etc/crio", ptr.To(corev1.HostPathDirectoryOrCreate)). - WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)), + WithHostPathVolume("crio-drop-in-config", "/etc/crio/crio.conf.d", ptr.To(corev1.HostPathDirectoryOrCreate)). 
+ WithHostPathVolume("nri-socket", "/var/run/nri", ptr.To(corev1.HostPathDirectoryOrCreate)), }, } @@ -1578,13 +1608,19 @@ func TestTransformKataManager(t *testing.T) { {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "RUNTIME_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, + {Name: "RUNTIME_NRI_SOCKET", Value: "/runtime/nri-sock-dir/nri.sock"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "kata-artifacts", MountPath: "/var/lib/kata"}, {Name: "containerd-config", MountPath: "/runtime/config-dir/"}, {Name: "containerd-socket", MountPath: "/runtime/sock-dir/"}, + {Name: "nri-socket", MountPath: "/runtime/nri-sock-dir/"}, }, - }).WithPullSecret("pull-secret").WithPodAnnotations(map[string]string{"nvidia.com/kata-manager.last-applied-hash": "1929911998"}).WithHostPathVolume("kata-artifacts", "/var/lib/kata", ptr.To(corev1.HostPathDirectoryOrCreate)).WithHostPathVolume("containerd-config", "/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)).WithHostPathVolume("containerd-socket", "/run/containerd", nil), + }).WithPullSecret("pull-secret").WithPodAnnotations(map[string]string{"nvidia.com/kata-manager.last-applied-hash": "1929911998"}). + WithHostPathVolume("kata-artifacts", "/var/lib/kata", ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("containerd-config", "/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)). + WithHostPathVolume("containerd-socket", "/run/containerd", nil). 
+ WithHostPathVolume("nri-socket", "/var/run/nri", ptr.To(corev1.HostPathDirectoryOrCreate)), }, { description: "transform kata manager with custom container runtime socket", @@ -1630,17 +1666,20 @@ func TestTransformKataManager(t *testing.T) { {Name: "RUNTIME", Value: "containerd"}, {Name: "RUNTIME_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "RUNTIME_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, + {Name: "RUNTIME_NRI_SOCKET", Value: "/runtime/nri-sock-dir/nri.sock"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "kata-artifacts", MountPath: "/var/lib/kata"}, {Name: "containerd-config", MountPath: "/runtime/config-dir/"}, {Name: "containerd-socket", MountPath: "/runtime/sock-dir/"}, + {Name: "nri-socket", MountPath: "/runtime/nri-sock-dir/"}, }, }).WithPullSecret("pull-secret"). WithPodAnnotations(map[string]string{"nvidia.com/kata-manager.last-applied-hash": "1929911998"}). WithHostPathVolume("kata-artifacts", "/var/lib/kata", ptr.To(corev1.HostPathDirectoryOrCreate)). WithHostPathVolume("containerd-config", "/var/lib/rancher/k3s/agent/etc/containerd", ptr.To(corev1.HostPathDirectoryOrCreate)). - WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil), + WithHostPathVolume("containerd-socket", "/run/k3s/containerd", nil). 
+ WithHostPathVolume("nri-socket", "/var/run/nri", ptr.To(corev1.HostPathDirectoryOrCreate)), }, } From 436d85455da1e98bc79d50e72e29313354515671 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Thu, 4 Dec 2025 14:18:49 -0800 Subject: [PATCH 2/2] add nri plugin annotation instead of setting nvidia runtimeclass Signed-off-by: Tariq Ibrahim --- .../gpu-feature-discovery/0500_daemonset.yaml | 2 ++ .../state-dcgm-exporter/0800_daemonset.yaml | 2 ++ assets/state-dcgm/0400_dcgm.yml | 2 ++ .../state-device-plugin/0500_daemonset.yaml | 2 ++ assets/state-kata-manager/0600_daemonset.yaml | 2 ++ assets/state-mig-manager/0600_daemonset.yaml | 2 ++ .../0400_daemonset.yaml | 2 ++ .../0500_daemonset.yaml | 2 ++ .../0500_daemonset.yaml | 2 ++ .../0500_daemonset.yaml | 2 ++ .../0600_daemonset.yaml | 2 ++ controllers/object_controls.go | 28 +++++++++---------- .../manifests/cuda-workload-validation.yaml | 2 ++ 13 files changed, 38 insertions(+), 14 deletions(-) diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml index 8cf620184..8be9f0769 100644 --- a/assets/gpu-feature-discovery/0500_daemonset.yaml +++ b/assets/gpu-feature-discovery/0500_daemonset.yaml @@ -16,6 +16,8 @@ spec: labels: app: gpu-feature-discovery app.kubernetes.io/part-of: nvidia-gpu + annotations: + nvidia.cdi.k8s.io/container.gpu-feature-discovery: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.gpu-feature-discovery: "true" diff --git a/assets/state-dcgm-exporter/0800_daemonset.yaml b/assets/state-dcgm-exporter/0800_daemonset.yaml index 30e65f103..750a5f8e4 100644 --- a/assets/state-dcgm-exporter/0800_daemonset.yaml +++ b/assets/state-dcgm-exporter/0800_daemonset.yaml @@ -15,6 +15,8 @@ spec: metadata: labels: app: nvidia-dcgm-exporter + annotations: + nvidia.cdi.k8s.io/container.nvidia-dcgm-exporter: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.dcgm-exporter: "true" diff --git 
a/assets/state-dcgm/0400_dcgm.yml b/assets/state-dcgm/0400_dcgm.yml index 14fea317a..c0478c873 100644 --- a/assets/state-dcgm/0400_dcgm.yml +++ b/assets/state-dcgm/0400_dcgm.yml @@ -15,6 +15,8 @@ spec: metadata: labels: app: nvidia-dcgm + annotations: + nvidia.cdi.k8s.io/container.nvidia-dcgm-ctr: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.dcgm: "true" diff --git a/assets/state-device-plugin/0500_daemonset.yaml b/assets/state-device-plugin/0500_daemonset.yaml index c4d85adfb..838cd5f6f 100644 --- a/assets/state-device-plugin/0500_daemonset.yaml +++ b/assets/state-device-plugin/0500_daemonset.yaml @@ -15,6 +15,8 @@ spec: metadata: labels: app: nvidia-device-plugin-daemonset + annotations: + nvidia.cdi.k8s.io/container.nvidia-device-plugin: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.device-plugin: "true" diff --git a/assets/state-kata-manager/0600_daemonset.yaml b/assets/state-kata-manager/0600_daemonset.yaml index 1f4ca17cb..628c49c30 100644 --- a/assets/state-kata-manager/0600_daemonset.yaml +++ b/assets/state-kata-manager/0600_daemonset.yaml @@ -13,6 +13,8 @@ spec: metadata: labels: name: nvidia-kata-manager + annotations: + nvidia.cdi.k8s.io/container.nvidia-kata-manager: management.nvidia.com/gpu=all spec: tolerations: - key: nvidia.com/gpu diff --git a/assets/state-mig-manager/0600_daemonset.yaml b/assets/state-mig-manager/0600_daemonset.yaml index 1a9076169..16976f251 100644 --- a/assets/state-mig-manager/0600_daemonset.yaml +++ b/assets/state-mig-manager/0600_daemonset.yaml @@ -13,6 +13,8 @@ spec: metadata: labels: app: nvidia-mig-manager + annotations: + nvidia.cdi.k8s.io/container.nvidia-mig-manager: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.mig-manager: "true" diff --git a/assets/state-mps-control-daemon/0400_daemonset.yaml b/assets/state-mps-control-daemon/0400_daemonset.yaml index 3be58af20..d799530d5 100644 --- a/assets/state-mps-control-daemon/0400_daemonset.yaml 
+++ b/assets/state-mps-control-daemon/0400_daemonset.yaml @@ -15,6 +15,8 @@ spec: metadata: labels: app: nvidia-device-plugin-mps-control-daemon + annotations: + nvidia.cdi.k8s.io/container.mps-control-daemon-ctr: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.device-plugin: "true" diff --git a/assets/state-operator-validation/0500_daemonset.yaml b/assets/state-operator-validation/0500_daemonset.yaml index 2030d6b01..a1757a13b 100644 --- a/assets/state-operator-validation/0500_daemonset.yaml +++ b/assets/state-operator-validation/0500_daemonset.yaml @@ -16,6 +16,8 @@ spec: labels: app: nvidia-operator-validator app.kubernetes.io/part-of: gpu-operator + annotations: + nvidia.cdi.k8s.io/container.toolkit-validation: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.operator-validator: "true" diff --git a/assets/state-sandbox-device-plugin/0500_daemonset.yaml b/assets/state-sandbox-device-plugin/0500_daemonset.yaml index f99b6f075..ebc930fd3 100644 --- a/assets/state-sandbox-device-plugin/0500_daemonset.yaml +++ b/assets/state-sandbox-device-plugin/0500_daemonset.yaml @@ -15,6 +15,8 @@ spec: metadata: labels: app: nvidia-sandbox-device-plugin-daemonset + annotations: + nvidia.cdi.k8s.io/container.nvidia-sandbox-device-plugin-ctr: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.sandbox-device-plugin: "true" diff --git a/assets/state-sandbox-validation/0500_daemonset.yaml b/assets/state-sandbox-validation/0500_daemonset.yaml index fcc2aa12a..bd2995763 100644 --- a/assets/state-sandbox-validation/0500_daemonset.yaml +++ b/assets/state-sandbox-validation/0500_daemonset.yaml @@ -16,6 +16,8 @@ spec: labels: app: nvidia-sandbox-validator app.kubernetes.io/part-of: gpu-operator + annotations: + nvidia.cdi.k8s.io/container.nvidia-sandbox-validator: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.sandbox-validator: "true" diff --git 
a/assets/state-vgpu-device-manager/0600_daemonset.yaml b/assets/state-vgpu-device-manager/0600_daemonset.yaml index afe9faeea..9b62a3ed3 100644 --- a/assets/state-vgpu-device-manager/0600_daemonset.yaml +++ b/assets/state-vgpu-device-manager/0600_daemonset.yaml @@ -15,6 +15,8 @@ spec: metadata: labels: app: nvidia-vgpu-device-manager + annotations: + nvidia.cdi.k8s.io/container.nvidia-vgpu-device-manager: management.nvidia.com/gpu=all spec: nodeSelector: nvidia.com/gpu.deploy.vgpu-device-manager: "true" diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 6ac6a17a2..ede25ea11 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -945,7 +945,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol return err } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1531,7 +1531,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe return err } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // update env required for MIG support applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy) @@ -1611,7 +1611,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return err } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // update env required for MIG support applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy) @@ -1719,7 +1719,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // 
setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // set hostPID if specified for DCGM Exporter if config.DCGMExporter.IsHostPIDEnabled() { @@ -1869,7 +1869,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) return nil } @@ -1911,7 +1911,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2205,7 +2205,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return fmt.Errorf("%v", err) } - setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) + // setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2579,13 +2579,13 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { return DefaultRuntimeClass } -func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) { - if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO { - return - } - runtimeClassName := getRuntimeClassName(config) - podSpec.RuntimeClassName = &runtimeClassName -} +// func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) { +// if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO { +// return +// } +// runtimeClassName := getRuntimeClassName(config) +// podSpec.RuntimeClassName = &runtimeClassName +//} func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) { var 
containerProbe *corev1.Probe diff --git a/validator/manifests/cuda-workload-validation.yaml b/validator/manifests/cuda-workload-validation.yaml index 11aca2a5a..76eb27adb 100644 --- a/validator/manifests/cuda-workload-validation.yaml +++ b/validator/manifests/cuda-workload-validation.yaml @@ -3,6 +3,8 @@ kind: Pod metadata: labels: app: nvidia-cuda-validator + annotations: + nvidia.cdi.k8s.io/container.cuda-validation: management.nvidia.com/gpu=all generateName: nvidia-cuda-validator- namespace: "FILLED_BY_THE_VALIDATOR" spec: