From 53e1031a4ffa5b0769cecafa92232c46e4f6a018 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 10 Sep 2024 06:13:24 +0200 Subject: [PATCH 01/10] sle15: Update Registry information Registry URIs for both SLE BCI and UBI were updated to reflect the current version. Signed-off-by: Egbert Eich --- sle15/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sle15/Dockerfile b/sle15/Dockerfile index 00d3fbf8..824454a3 100644 --- a/sle15/Dockerfile +++ b/sle15/Dockerfile @@ -1,7 +1,7 @@ ARG SLES_VERSION -FROM nvcr.io/nvidia/cuda:12.6.2-base-ubi8 as license +FROM nvcr.io/nvidia/cuda:12.6.0-base-ubi9 as license -FROM registry.suse.com/bci/golang:1.17 as build +FROM registry.suse.com/bci/golang:1.23 as build RUN zypper --non-interactive install -y git wget tar gzip @@ -12,7 +12,7 @@ RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \ go build -o vgpu-util && \ mv vgpu-util /work -FROM registry.suse.com/suse/sle15:$SLES_VERSION +FROM registry.suse.com/bci/bci-base:$SLES_VERSION #ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64 ARG BASE_URL=https://us.download.nvidia.com/tesla From 4c26246416515ec72df9f3db6cbbe381fd81a54c Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:21:20 +0200 Subject: [PATCH 02/10] sle15: Add build and load options Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 50 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 57c96e20..e91b87f6 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -419,7 +419,7 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -init() { +_prepare() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 fi @@ -434,6 +434,10 @@ init() { echo -e "\n========== NVIDIA Software Installer ==========\n" echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version 
${KERNEL_VERSION}\n" +} + +_prepare_exclusive() { + _prepare exec 3> ${PID_FILE} if ! flock -n 3; then @@ -447,7 +451,10 @@ init() { _unload_driver || exit 1 _unmount_rootfs +} +_build() { + local cleanup=false # Install dependencies if _kernel_requires_package; then @@ -455,14 +462,21 @@ init() { _resolve_kernel_version || exit 1 _install_prerequisites _create_driver_package - #_remove_prerequisites - _cleanup_package_cache + cleanup=true fi - # Build the driver + # Build the driver - rootfs needs to be mounted as the build magic attempts to + # load the driver. _install_driver - _load_driver || exit 1 + if $cleanup; then + # Do not call _remove_prerequisites as this will delete depmod information + _cleanup_package_cache + fi +} + +_load() { _mount_rootfs + _load_driver || exit 1 _write_kernel_update_hook echo "Done, now waiting for signal" @@ -473,6 +487,26 @@ init() { exit 0 } +init() { + _prepare_exclusive + + _build + + _load +} + +build() { + _prepare + + _build +} + +load() { + _prepare_exclusive + + _load +} + update() { exec 3>&2 if exec 2> /dev/null 4< ${PID_FILE}; then @@ -511,7 +545,7 @@ update() { if _kernel_requires_package; then _create_driver_package fi - #_remove_prerequisites + # Do not call _remove_prerequisites as this will delete depmod information _cleanup_package_cache echo "Done" @@ -524,6 +558,8 @@ Usage: $0 COMMAND [ARG...] 
Commands: init [-a | --accept-license] [-m | --max-threads MAX_THREADS] + build [-a | --accept-license] [-m | --max-threads MAX_THREADS] + load update [-k | --kernel VERSION] [-s | --sign KEYID] [-t | --tag TAG] [-m | --max-threads MAX_THREADS] EOF exit 1 @@ -535,6 +571,8 @@ fi command=$1; shift case "${command}" in init) options=$(getopt -l accept-license,max-threads: -o am: -- "$@") ;; + build) options=$(getopt -l accept-license,tag:,max-threads: -o a:t:m: -- "$@") ;; + load) options="" ;; update) options=$(getopt -l kernel:,sign:,tag:,max-threads: -o k:s:t:m: -- "$@") ;; *) usage ;; esac From ff8544a2f9281060eed80e023eb40d19b186298c Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:25:56 +0200 Subject: [PATCH 03/10] sle15: Add support for open driver build & attempt to suppress warnings Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index e91b87f6..2c11e833 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -12,6 +12,9 @@ NVIDIA_MODULE_PARAMS=() NVIDIA_UVM_MODULE_PARAMS=() NVIDIA_MODESET_MODULE_PARAMS=() +OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-true} +[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "false" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel + _update_package_cache() { if [ "${PACKAGE_TAG:-}" != "builtin" ]; then echo "Updating the package cache..." @@ -96,8 +99,8 @@ _kernel_requires_package() { echo "Checking NVIDIA driver packages..." - [[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/kernel ]] && return 0 - cd /usr/src/nvidia-${DRIVER_VERSION}/kernel + [[ ! 
-d /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} ]] && return 0 + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc" for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do @@ -120,7 +123,7 @@ _create_driver_package() ( trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/source clean > /dev/null" EXIT echo "Compiling NVIDIA driver kernel modules..." - cd /usr/src/nvidia-${DRIVER_VERSION}/kernel + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/source nv-linux.o nv-modeset-linux.o > /dev/null echo "Relinking NVIDIA driver kernel modules..." @@ -331,7 +334,7 @@ _install_driver() { if [ "${ACCEPT_LICENSE}" = "yes" ]; then install_args+=("--accept-license") fi - nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"} + IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} --no-rebuild-initramfs ${install_args[@]+"${install_args[@]}"} } # Mount the driver rootfs into the run directory with the exception of sysfs. 
@@ -429,7 +432,7 @@ _prepare() { cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ sh /tmp/install.sh nvinstall && \ mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ - mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled $KERNEL_TYPE /usr/src/nvidia-$DRIVER_VERSION && \ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest echo -e "\n========== NVIDIA Software Installer ==========\n" From 9987cc0c8a60368af4a1ff08e319d4f81b9a0739 Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:28:08 +0200 Subject: [PATCH 04/10] sle15: Set firmware search path Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 2c11e833..8688ea5e 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -208,6 +208,25 @@ _get_module_params() { # Load the kernel modules and start persistenced. _load_driver() { + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local set_fw_path="true" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + for param in "${NVIDIA_MODULE_PARAMS[@]}"; do + if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then + set_fw_path="false" + fi + done + + if [[ "$set_fw_path" == "true" ]]; then + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration" + else + echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure the firmware search path" + fi + fi + echo "Parsing kernel module parameters..." 
_get_module_params From 848e1e673a9da525a230de58c025058455361d6b Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:28:49 +0200 Subject: [PATCH 05/10] sle15: Check for SELinux Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 8688ea5e..65eeb65f 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -363,6 +363,16 @@ _mount_rootfs() { mount --make-private /sys mkdir -p ${RUN_DIR}/driver mount --rbind / ${RUN_DIR}/driver + + echo "Check SELinux status" + if [ -e /sys/fs/selinux ]; then + echo "SELinux is enabled" + echo "Change device files security context for selinux compatibility" + chcon -R -t container_file_t ${RUN_DIR}/driver/dev + else + echo "SELinux is disabled, skipping..." + fi + } # Unmount the driver rootfs from the run directory. From ef138dc56583b42ec5ffb2876e4ff7df3a5ff98c Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:29:45 +0200 Subject: [PATCH 06/10] sle15: Add comment explaining purpose of environment variable Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 1 + 1 file changed, 1 insertion(+) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 65eeb65f..f49b739b 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -20,6 +20,7 @@ _update_package_cache() { echo "Updating the package cache..." FLAVOR="$(echo ${KERNEL_VERSION} | cut -d- -f3)" if [ "$FLAVOR" == "azure" ]; then + # consumed by container-suseconnect when calling `zypper refresh` export ADDITIONAL_MODULES="sle-module-public-cloud" fi if ! zypper refresh; then From b6a3f0c0bb72b0a6197b05519a448341c5f0f474 Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 15:51:31 +0200 Subject: [PATCH 07/10] sle15: nvidia-installer - include drm, suppress attempt to load modules The DRM module is now required for the driver to function properly. Loading modules here will preempt the explicit module load later. 
Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index f49b739b..74903a7f 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -354,7 +354,7 @@ _install_driver() { if [ "${ACCEPT_LICENSE}" = "yes" ]; then install_args+=("--accept-license") fi - IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} --no-rebuild-initramfs ${install_args[@]+"${install_args[@]}"} + IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --ui=none --no-nouveau-check -m=${KERNEL_TYPE} --no-rebuild-initramfs ${install_args[@]+"${install_args[@]}"} --skip-module-load # --no-drm } # Mount the driver rootfs into the run directory with the exception of sysfs. From 17a584ec4635277e0dc8979358092a4bbf07ffee Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:38:24 +0200 Subject: [PATCH 08/10] sle15: Add support for dependency count for modeset module The DRM module depends on the modeset module and therefore holds a reference on it. Make sure to track this properly to ensure old drivers are fully unloaded. Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 74903a7f..428b3f7d 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -268,9 +268,11 @@ _load_driver() { _unload_driver() { local rmmod_args=() local nvidia_deps=0 + local nvidia_modeset_deps=0 local nvidia_refs=0 local nvidia_uvm_refs=0 local nvidia_modeset_refs=0 + local nvidia_drm_refs=0 echo "Stopping NVIDIA persistence daemon..." if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then @@ -318,6 +320,11 @@ _unload_driver() { fi echo "Unloading NVIDIA driver kernel modules..." 
+ if [ -f /sys/module/nvidia_drm/refcnt ]; then + nvidia_drm_refs=$(< /sys/module/nvidia_drm/refcnt) + rmmod_args+=("nvidia-drm") + ((++nvidia_modeset_deps)) + fi if [ -f /sys/module/nvidia_modeset/refcnt ]; then nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) rmmod_args+=("nvidia-modeset") @@ -332,7 +339,7 @@ _unload_driver() { nvidia_refs=$(< /sys/module/nvidia/refcnt) rmmod_args+=("nvidia") fi - if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then + if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt ${nvidia_modeset_deps} ] || [ ${nvidia_drm_refs} -gt 0 ]; then echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 return 1 fi From 6a5963c77eb885e885341e58c8976702e8090d5c Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 20 Sep 2024 16:43:41 +0200 Subject: [PATCH 09/10] sle15: Do not install distro version of open driver when building one from CUDA SUSE's version of the NVIDIA driver has 'Supplements:' on PCI device IDs. These cause the open driver to be installed on machines which happen to have a matching NVIDIA card when the kernel is installed. Use a little-known trick to work around this: point ZYPP_MODALIAS_SYSFS at an empty temporary file while the kernel packages are installed. Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 428b3f7d..25e9d910 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -70,10 +70,13 @@ _install_prerequisites() ( echo "Installing Linux kernel source..." local version_without_flavor=$(echo ${KERNEL_VERSION} | cut -d- -f-2) + export ZYPP_MODALIAS_SYSFS=$(mktemp /tmp/modalias-XXXX) if ! zypper --non-interactive in -y --no-recommends --capability kernel-${FLAVOR} = ${version_without_flavor} kernel-${FLAVOR}-devel = ${version_without_flavor} ; then echo "FATAL: failed to install kernel packages. Ensure SLES subscription is available." 
+ rm -f ${ZYPP_MODALIAS_SYSFS} exit 1 fi + rm -f ${ZYPP_MODALIAS_SYSFS}; unset ZYPP_MODALIAS_SYSFS echo "Generating Linux kernel version string..." extract-vmlinux /boot/vmlinuz-${KERNEL_VERSION} | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version From 616a428d0ce2192a314a499ff750c934c2ea8c95 Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Thu, 28 Nov 2024 16:22:39 +0100 Subject: [PATCH 10/10] sle15: Before loading the newly built modules make sure nothing is loaded The module build process may load modules which are installed on the container host itself. Make sure these are unloaded before loading our brand new modules. Signed-off-by: Egbert Eich --- sle15/nvidia-driver | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sle15/nvidia-driver b/sle15/nvidia-driver index 25e9d910..db7bf9b6 100755 --- a/sle15/nvidia-driver +++ b/sle15/nvidia-driver @@ -519,6 +519,9 @@ _build() { _load() { _mount_rootfs + # Something in the build process may have decided to load drivers that happened to be installed. + # Make sure they are unloaded. + lsmod | grep -q nvidia && { _unload_driver || exit 1; } || true _load_driver || exit 1 _write_kernel_update_hook