Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,13 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
HostPID *bool `json:"hostPID,omitempty"`

// HostNetwork allows the DCGM-Exporter daemon set to expose metrics port on the host's network namespace.
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable hostNetwork for NVIDIA DCGM Exporter"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
HostNetwork *bool `json:"hostNetwork,omitempty"`

// Optional: HPC job mapping configuration for NVIDIA DCGM Exporter
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
Expand Down Expand Up @@ -1970,6 +1977,15 @@ func (e *DCGMExporterSpec) IsHostPIDEnabled() bool {
return *e.HostPID
}

// IsHostNetworkEnabled reports whether hostNetwork has been turned on for DCGM Exporter.
// A HostNetwork field left unset by the user counts as disabled.
func (e *DCGMExporterSpec) IsHostNetworkEnabled() bool {
	// nil means "not specified", which defaults to false
	return e.HostNetwork != nil && *e.HostNetwork
}

// IsHPCJobMappingEnabled returns true if HPC job mapping is enabled for DCGM Exporter
func (e *DCGMExporterSpec) IsHPCJobMappingEnabled() bool {
if e.HPCJobMapping == nil || e.HPCJobMapping.Enabled == nil {
Expand Down
5 changes: 5 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,10 @@ spec:
- name
type: object
type: array
hostNetwork:
description: HostNetwork allows the DCGM-Exporter daemon set to
expose metrics port on the host's network namespace.
type: boolean
hostPID:
description: HostPID allows the DCGM-Exporter daemon set to access
the host's PID namespace
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,10 @@ spec:
- name
type: object
type: array
hostNetwork:
description: HostNetwork allows the DCGM-Exporter daemon set to
expose metrics port on the host's network namespace.
type: boolean
hostPID:
description: HostPID allows the DCGM-Exporter daemon set to access
the host's PID namespace
Expand Down
5 changes: 5 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1698,6 +1698,11 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.HostNetwork = true
}
}
// Enable hostNetwork when it is requested for DCGM Exporter. This only ever sets
// the value to true, so a hostNetwork setting already enabled above is never reset.
if config.DCGMExporter.IsHostNetworkEnabled() {
obj.Spec.Template.Spec.HostNetwork = true
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

Expand Down
91 changes: 91 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,97 @@ func TestTransformDCGMExporter(t *testing.T) {
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostPID(false),
},
{
description: "transform dcgm exporter with hostNetwork enabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
HostNetwork: newBoolPtr(true),
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
},
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
{Name: "foo", Value: "bar"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(true),
},
{
description: "transform dcgm exporter with hostNetwork disabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
HostNetwork: newBoolPtr(false),
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
},
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
{Name: "foo", Value: "bar"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false),
},
{
description: "transform dcgm exporter with hostNetwork unspecified",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
Env: []gpuv1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
{Name: "foo", Value: "bar"},
},
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "localhost:5555"},
{Name: "foo", Value: "bar"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithHostNetwork(false),
},
{
description: "transform dcgm exporter, openshift",
openshiftVersion: "1.0.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,10 @@ spec:
- name
type: object
type: array
hostNetwork:
description: HostNetwork allows the DCGM-Exporter daemon set to
expose metrics port on the host's network namespace.
type: boolean
hostPID:
description: HostPID allows the DCGM-Exporter daemon set to access
the host's PID namespace
Expand Down
3 changes: 3 additions & 0 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,9 @@ spec:
{{- if .Values.dcgmExporter.hostPID }}
hostPID: {{ .Values.dcgmExporter.hostPID }}
{{- end }}
{{- if .Values.dcgmExporter.hostNetwork }}
hostNetwork: {{ .Values.dcgmExporter.hostNetwork }}
{{- end }}
{{- if .Values.dcgmExporter.hpcJobMapping }}
hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }}
{{- end }}
Expand Down
1 change: 1 addition & 0 deletions deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ dcgmExporter:
env: []
resources: {}
hostPID: false
hostNetwork: false
# HPC job mapping configuration for correlating GPU metrics with HPC workload manager jobs
# This is used by HPC workload managers like Slurm to label GPU metrics with job IDs
# hpcJobMapping:
Expand Down