diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index cb8ce2ef8..ea4b21d86 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -931,6 +931,13 @@ type DCGMExporterSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" HostPID *bool `json:"hostPID,omitempty"` + // HostNetwork allows the DCGM-Exporter daemon set to expose metrics port on the host's network namespace. + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable hostNetwork for NVIDIA DCGM Exporter" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + HostNetwork *bool `json:"hostNetwork,omitempty"` + // Optional: HPC job mapping configuration for NVIDIA DCGM Exporter // +kubebuilder:validation:Optional // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true @@ -1970,6 +1977,15 @@ func (e *DCGMExporterSpec) IsHostPIDEnabled() bool { return *e.HostPID } +// IsHostNetworkEnabled returns true if hostNetwork is enabled for DCGM Exporter +func (e *DCGMExporterSpec) IsHostNetworkEnabled() bool { + if e.HostNetwork == nil { + // default is false if not specified by user + return false + } + return *e.HostNetwork +} + // IsHPCJobMappingEnabled returns true if HPC job mapping is enabled for DCGM Exporter func (e *DCGMExporterSpec) IsHPCJobMappingEnabled() bool { if e.HPCJobMapping == nil || e.HPCJobMapping.Enabled == nil { diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index fbd4ed603..9e68fdb37 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -404,6 +404,11 @@ func (in *DCGMExporterSpec) DeepCopyInto(out 
*DCGMExporterSpec) { *out = new(bool) **out = **in } + if in.HostNetwork != nil { + in, out := &in.HostNetwork, &out.HostNetwork + *out = new(bool) + **out = **in + } if in.HPCJobMapping != nil { in, out := &in.HPCJobMapping, &out.HPCJobMapping *out = new(DCGMExporterHPCJobMappingConfig) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 0510c140f..3e63d7d7a 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -339,6 +339,10 @@ spec: - name type: object type: array + hostNetwork: + description: HostNetwork allows the DCGM-Exporter daemon set to + expose metrics port on the host's network namespace. + type: boolean hostPID: description: HostPID allows the DCGM-Exporter daemon set to access the host's PID namespace diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 0510c140f..3e63d7d7a 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -339,6 +339,10 @@ spec: - name type: object type: array + hostNetwork: + description: HostNetwork allows the DCGM-Exporter daemon set to + expose metrics port on the host's network namespace. 
+ type: boolean hostPID: description: HostPID allows the DCGM-Exporter daemon set to access the host's PID namespace diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0b08fbc6d..8fb4461d9 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1696,8 +1696,15 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if remoteEngine != "" && strings.HasPrefix(remoteEngine, "localhost") { // enable hostNetwork for communication with external DCGM using localhost obj.Spec.Template.Spec.HostNetwork = true + obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet } } + // force hostNetwork on when it is explicitly enabled for DCGM Exporter; when it is unset or + // false, keep whatever the localhost hostengine check above already decided + if config.DCGMExporter.IsHostNetworkEnabled() { + obj.Spec.Template.Spec.HostNetwork = true + obj.Spec.Template.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet + } setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 2e0ae7c1d..ff384c676 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -168,6 +168,11 @@ func (d Daemonset) WithHostNetwork(enabled bool) Daemonset { return d } +func (d Daemonset) WithDNSPolicy(policy corev1.DNSPolicy) Daemonset { + d.Spec.Template.Spec.DNSPolicy = policy + return d +} + func (d Daemonset) WithHostPID(enabled bool) Daemonset { d.Spec.Template.Spec.HostPID = enabled return d @@ -1272,6 +1277,133 @@ func TestTransformDCGMExporter(t *testing.T) { }, }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostPID(false), }, + { + description: "transform dcgm exporter with hostNetwork enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "dcgm-exporter"}). 
+ WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "dcgm-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Args: []string{"--fail-on-init-error=false"}, + HostNetwork: newBoolPtr(true), + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"}, + }, + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--fail-on-init-error=false"}, + Env: []corev1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"}, + {Name: "foo", Value: "bar"}, + }, + }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(true).WithDNSPolicy(corev1.DNSClusterFirstWithHostNet), + }, + { + description: "transform dcgm exporter with hostNetwork disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "dcgm-exporter"}). 
+ WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "dcgm-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Args: []string{"--fail-on-init-error=false"}, + HostNetwork: newBoolPtr(false), + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--fail-on-init-error=false"}, + Env: []corev1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"}, + {Name: "foo", Value: "bar"}, + }, + }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false), + }, + { + description: "transform dcgm exporter with hostNetwork unspecified", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "dcgm-exporter"}). 
+ WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "dcgm-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Args: []string{"--fail-on-init-error=false"}, + Env: []gpuv1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"}, + {Name: "foo", Value: "bar"}, + }, + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--fail-on-init-error=false"}, + Env: []corev1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"}, + {Name: "foo", Value: "bar"}, + }, + }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false), + }, + { + description: "transform dcgm exporter with dcgm running on the host itself(DGX BaseOS)", + ds: NewDaemonset(). + WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Env: []corev1.EnvVar{{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"}}, + }). 
+ WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGM: gpuv1.DCGMSpec{ + Enabled: newBoolPtr(false), + }, + DCGMExporter: gpuv1.DCGMExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "dcgm-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Args: []string{"--fail-on-init-error=false"}, + Env: []gpuv1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"}, + {Name: "foo", Value: "bar"}, + }, + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--fail-on-init-error=false"}, + Env: []corev1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"}, + {Name: "foo", Value: "bar"}, + }, + }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(true).WithDNSPolicy(corev1.DNSClusterFirstWithHostNet), + }, { description: "transform dcgm exporter, openshift", openshiftVersion: "1.0.0", diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 0510c140f..3e63d7d7a 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -339,6 +339,10 @@ spec: - name type: object type: array + hostNetwork: + description: HostNetwork allows the DCGM-Exporter daemon set to + expose metrics port on the host's network namespace. 
+ type: boolean hostPID: description: HostPID allows the DCGM-Exporter daemon set to access the host's PID namespace diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index c9ce43a31..33efec386 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -539,6 +539,9 @@ spec: {{- if .Values.dcgmExporter.hostPID }} hostPID: {{ .Values.dcgmExporter.hostPID }} {{- end }} + {{- if .Values.dcgmExporter.hostNetwork }} + hostNetwork: {{ .Values.dcgmExporter.hostNetwork }} + {{- end }} {{- if .Values.dcgmExporter.hpcJobMapping }} hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }} {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 184697143..bdd5dd470 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -283,6 +283,7 @@ dcgmExporter: env: [] resources: {} hostPID: false + hostNetwork: false # HPC job mapping configuration for correlating GPU metrics with HPC workload manager jobs # This is used by HPC workload managers like Slurm to label GPU metrics with job IDs # hpcJobMapping: