diff --git a/cmd/nvidia-validator/main.go b/cmd/nvidia-validator/main.go index 42ecfcb2c..bea33a169 100644 --- a/cmd/nvidia-validator/main.go +++ b/cmd/nvidia-validator/main.go @@ -32,6 +32,7 @@ import ( "github.com/NVIDIA/go-nvlib/pkg/nvpci" devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-dev-char-symlinks" log "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert/yaml" cli "github.com/urfave/cli/v3" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -67,6 +68,9 @@ type NvidiaFs struct{} // GDRCopy driver component type GDRCopy struct{} +// NvidiaPeermem driver component +type NvidiaPeermem struct{} + // CUDA represents spec to run cuda workload type CUDA struct { ctx context.Context @@ -150,12 +154,16 @@ const ( defaultDriverInstallDir = "/run/nvidia/driver" // defaultDriverInstallDirCtrPath indicates the default path where the NVIDIA driver install dir is mounted in the container defaultDriverInstallDirCtrPath = "/run/nvidia/driver" + // additionalDriversFlagsFilePath indicates the path to the file which contains additional drivers status flags + additionalDriversFlagsFilePath = defaultDriverInstallDirCtrPath + "/.additional-drivers-flags" // driverStatusFile indicates status file for containerizeddriver readiness driverStatusFile = "driver-ready" // nvidiaFsStatusFile indicates status file for nvidia-fs driver readiness nvidiaFsStatusFile = "nvidia-fs-ready" // gdrCopyStatusFile indicates status file for GDRCopy driver (gdrdrv) readiness gdrCopyStatusFile = "gdrcopy-ready" + // nvidiaPeermemStatusFile indicates status file for nvidia-peermem driver readiness + nvidiaPeermemStatusFile = "nvidia-peermem-ready" // toolkitStatusFile indicates status file for toolkit readiness toolkitStatusFile = "toolkit-ready" // pluginStatusFile indicates status file for plugin readiness @@ -445,6 +453,8 @@ func isValidComponent() bool { case "nvidia-fs": fallthrough case "gdrcopy": + fallthrough + case "nvidia-peermem": return true default: return false @@ -509,6 +519,10 @@ func start(ctx context.Context, cli *cli.Command) error { return err } + return validateComponent(ctx, componentFlag) +} + +func validateComponent(ctx context.Context, componentFlag string) error { switch componentFlag { case "driver": driver := &Driver{ @@ -533,6 +547,13 @@ func start(ctx context.Context, cli *cli.Command) error { return fmt.Errorf("error validating gdrcopy driver installation: %w", err) } return nil + case "nvidia-peermem": + nvidiaPeermem := &NvidiaPeermem{} + err := nvidiaPeermem.validate() + if err != nil { + return fmt.Errorf("error validating nvidia-peermem driver installation: %w", err) + } + return nil case "toolkit": toolkit := &Toolkit{} err := toolkit.validate() @@ -795,9 +816,53 @@ func (d *Driver) runValidation(silent bool) (driverInfo, error) { if err != nil { return driverInfo{}, err } + + err = validateAdditionalDriverComponents(d.ctx) + if err != nil { + return driverInfo{}, err + } + return getDriverInfo(false, hostRootFlag, driverInstallDirFlag, driverInstallDirCtrPathFlag), nil } +func validateAdditionalDriverComponents(ctx context.Context) error { + data, err := os.ReadFile(additionalDriversFlagsFilePath) + if err != nil { + return err + } + + supportedFeatures := map[string]string{ + "GDRCOPY_ENABLED": "gdrcopy", + "GDS_ENABLED": "nvidia-fs", + "GPU_DIRECT_RDMA_ENABLED": "nvidia-peermem", + } + + features := map[string]bool{} + if err := yaml.Unmarshal(data, &features); err != nil { + return err + } + + for k, enabled := range features { + if !enabled { + log.Debugf("%s is set to %t, skipping checking it", k, enabled) + continue + } + + component, ok := supportedFeatures[k] + if !ok { + log.Infof("unsupported feature flag: %s, skipping checking it", k) + continue + } + + log.Infof("Validating additional enabled driver component: %s", component) + if err := validateComponent(ctx, component); err != nil { + return err + } + } + + return nil +} + func (d *Driver) validate() error { // delete driver status file is already present err := deleteStatusFile(outputDirFlag + "/" + driverStatusFile) @@ -994,6 +1059,38 @@ func (g *GDRCopy) runValidation(silent bool) error { return runCommand(command, args, silent) } +func (n *NvidiaPeermem) validate() error { + // delete driver status file if already present + err := deleteStatusFile(outputDirFlag + "/" + nvidiaPeermemStatusFile) + if err != nil { + return err + } + + err = n.runValidation(false) + if err != nil { + log.Info("nvidia-peermem driver is not ready") + return err + } + + // create driver status file + err = createStatusFile(outputDirFlag + "/" + nvidiaPeermemStatusFile) + if err != nil { + return err + } + return nil +} + +func (n *NvidiaPeermem) runValidation(silent bool) error { + // check for nvidia_peermem module to be loaded + command := shell + args := []string{"-c", "lsmod | grep -E '^nvidia_peermem\\s'"} + + if withWaitFlag { + return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent) + } + return runCommand(command, args, silent) +} + func (t *Toolkit) validate() error { // delete status file is already present err := deleteStatusFile(outputDirFlag + "/" + toolkitStatusFile) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0b08fbc6d..3c3c753a6 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -3435,6 +3435,15 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy setContainerProbe(driverContainer, config.Driver.ReadinessProbe, Readiness) } + if config.GDRCopy != nil && config.GDRCopy.IsEnabled() { + // set env indicating gdrcopy is enabled + setContainerEnv(driverContainer, GDRCopyEnabledEnvName, "true") + } + if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() { + // set env indicating gds is enabled + setContainerEnv(driverContainer, GDSEnabledEnvName, "true") + } + if config.Driver.GPUDirectRDMA != nil && config.Driver.GPUDirectRDMA.IsEnabled() { // set env indicating nvidia-peermem is enabled to compile module with required ib_* interfaces setContainerEnv(driverContainer, GPUDirectRDMAEnabledEnvName, "true") diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index e3d0dc1f6..974b2893e 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -226,6 +226,22 @@ spec: value: "true" {{- end }} {{- end }} + {{- if and (.GPUDirectRDMA) (deref .GPUDirectRDMA.Enabled) }} + - name: GPU_DIRECT_RDMA_ENABLED + value: "true" + {{- if deref .GPUDirectRDMA.UseHostMOFED }} + - name: USE_HOST_MOFED + value: "true" + {{- end }} + {{- end }} + {{- if and (.GDS) (deref .GDS.Spec.Enabled) }} + - name: GDS_ENABLED + value: "true" + {{- end }} + {{- if and (.GDRCopy) (deref .GDRCopy.Spec.Enabled) }} + - name: GDRCOPY_ENABLED + value: "true" + {{- end }} {{- if and (.Openshift) (.Runtime.OpenshiftVersion) }} - name: OPENSHIFT_VERSION value: {{ .Runtime.OpenshiftVersion | quote }}