Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions rhel9/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,34 @@ _gdrcopy_enabled() {
fi
return 1
}

# Print the contents of a config file on stdout as a single line, with
# every newline replaced by a space.
# Arguments: $1 - path of the config file to read
# Returns:   1 (and prints nothing) when $1 is not a regular file;
#            otherwise the exit status of tr.
_read_conf_file() {
  local conf_path="$1"
  if [ ! -f "$conf_path" ]; then
    return 1
  fi
  tr '\n' ' ' < "$conf_path"
}

# Build the driver configuration snapshot used for state comparison.
#
# Prints one KEY=VALUE line per setting on stdout: the driver version and
# type, the running kernel release (uname -r), the RDMA / MOFED / kernel
# module type switches, and the flattened contents of the module parameter
# files under /drivers.
#
# Globals (read): DRIVER_VERSION, DRIVER_TYPE, GPU_DIRECT_RDMA_ENABLED,
#                 USE_HOST_MOFED, KERNEL_MODULE_TYPE
#
# Note: the variables are normally set by the sourcing script (nvidia-driver).
# The defaults that nvidia-driver applies (DRIVER_TYPE=passthrough,
# GPU_DIRECT_RDMA_ENABLED=false, USE_HOST_MOFED=false) are repeated here so a
# caller that has not run those assignments still produces the same snapshot.
# The ":-" forms also keep the expansion from aborting under `set -o nounset`.
_build_driver_config() {
  cat <<EOF
DRIVER_VERSION=${DRIVER_VERSION:-}
DRIVER_TYPE=${DRIVER_TYPE:-passthrough}
KERNEL_VERSION=$(uname -r)
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-}
NVIDIA_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia.conf)
NVIDIA_UVM_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-uvm.conf)
NVIDIA_MODESET_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-modeset.conf)
NVIDIA_PEERMEM_MODULE_PARAMS=$(_read_conf_file /drivers/nvidia-peermem.conf)
EOF
}

# Check if fast path should be used (driver already loaded with matching config).
#
# Globals (read): RUN_DIR - directory holding driver-config.state; falls back
#                 to /run/nvidia when unset or empty.
# Returns: 0 when /sys/module/nvidia/refcnt exists and the stored state file
#          is identical to the output of _build_driver_config; 1 otherwise.
_should_use_fast_path() {
  # Read from the same location _store_driver_config writes to
  # (${RUN_DIR}/driver-config.state).
  local state_file="${RUN_DIR:-/run/nvidia}/driver-config.state"
  local current_config stored_config

  [ -f /sys/module/nvidia/refcnt ] || return 1
  [ -f "${state_file}" ] || return 1

  # Declared separately from the assignments so that a failing command is
  # not masked by the exit status of `local`.
  current_config=$(_build_driver_config) || return 1
  stored_config=$(cat "${state_file}" 2>/dev/null) || return 1

  [ "${current_config}" = "${stored_config}" ]
}
196 changes: 144 additions & 52 deletions rhel9/nvidia-driver
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
NUM_VGPU_DEVICES=0
DRIVER_TYPE="${DRIVER_TYPE:-passthrough}"
GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
NVIDIA_MODULE_PARAMS=()
NVIDIA_UVM_MODULE_PARAMS=()
NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=9
Expand Down Expand Up @@ -398,44 +400,7 @@ _load_driver() {
set +o xtrace -o nounset
fi

echo "Starting NVIDIA persistence daemon..."
nvidia-persistenced --persistence-mode

if [ "${DRIVER_TYPE}" = "vgpu" ]; then
echo "Copying gridd.conf..."
cp /drivers/gridd.conf /etc/nvidia/gridd.conf
if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
echo "Copying ClientConfigToken..."
mkdir -p /etc/nvidia/ClientConfigToken/
cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
fi

echo "Starting nvidia-gridd.."
LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

# Start virtual topology daemon
_start_vgpu_topology_daemon
fi

if _assert_nvlink5_system; then
_ensure_nvlink5_prerequisites || return 1
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
--fm-config-file $fm_config_file \
--fm-pid-file $fm_pid_file \
--nvlsm-config-file $nvlsm_config_file \
--nvlsm-pid-file $nvlsm_pid_file

# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
elif _assert_nvswitch_system; then
echo "Starting NVIDIA fabric manager daemon..."
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
fi
_start_daemons
}

# Stop persistenced and unload the kernel modules if they are currently loaded.
Expand Down Expand Up @@ -477,6 +442,21 @@ _unload_driver() {
fi
fi

if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then
echo "Stopping NVIDIA topology daemon..."
local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid)

kill -SIGTERM "${pid}"
for i in $(seq 1 50); do
kill -0 "${pid}" 2> /dev/null || break
sleep 0.1
done
if [ $i -eq 50 ]; then
echo "Could not stop NVIDIA topology daemon" >&2
return 1
fi
fi

if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
echo "Stopping NVIDIA fabric manager daemon..."
local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
Expand Down Expand Up @@ -566,11 +546,7 @@ _install_driver() {
install_args+=("--skip-module-load")
fi

IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
# May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
# /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
# TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit
#nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
IGNORE_CC_MISMATCH=1 nvidia-installer --silent --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
}

# Mount the driver rootfs into the run directory with the exception of sysfs.
Expand Down Expand Up @@ -701,6 +677,91 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Start the NVIDIA user-space daemons that accompany a loaded driver.
#
# Always starts nvidia-persistenced. When DRIVER_TYPE is "vgpu" it also copies
# gridd.conf (plus the ClientConfigToken files when VGPU_LICENSE_SERVER_TYPE is
# "NLS"), starts nvidia-gridd and the virtual topology daemon. Finally starts
# the fabric manager: via nvidia-fabricmanager-start.sh when
# _assert_nvlink5_system succeeds, or via nv-fabricmanager when
# _assert_nvswitch_system succeeds.
#
# Globals (read): DRIVER_TYPE, VGPU_LICENSE_SERVER_TYPE
# Returns: 1 when _ensure_nvlink5_prerequisites fails; otherwise the status of
#          the last command executed.
_start_daemons() {
  # Function-scoped so these helper paths do not leak into the caller's globals.
  local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

  echo "Starting NVIDIA persistence daemon..."
  nvidia-persistenced --persistence-mode

  if [ "${DRIVER_TYPE}" = "vgpu" ]; then
    echo "Copying gridd.conf..."
    cp /drivers/gridd.conf /etc/nvidia/gridd.conf
    if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
      echo "Copying ClientConfigToken..."
      mkdir -p /etc/nvidia/ClientConfigToken/
      cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
    fi

    echo "Starting nvidia-gridd.."
    LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

    # Start virtual topology daemon
    _start_vgpu_topology_daemon
  fi

  if _assert_nvlink5_system; then
    _ensure_nvlink5_prerequisites || return 1
    echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

    fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
    fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
    nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
    nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
    /usr/bin/nvidia-fabricmanager-start.sh --mode start \
      --fm-config-file "${fm_config_file}" \
      --fm-pid-file "${fm_pid_file}" \
      --nvlsm-config-file "${nvlsm_config_file}" \
      --nvlsm-pid-file "${nvlsm_pid_file}"

  # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
  elif _assert_nvswitch_system; then
    echo "Starting NVIDIA fabric manager daemon..."
    nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
  fi
}

# Persist the current driver configuration snapshot.
# Writes the output of _build_driver_config to
# ${RUN_DIR}/driver-config.state and reports progress on stdout.
# Globals (read): RUN_DIR
_store_driver_config() {
  local state_path
  state_path="${RUN_DIR}/driver-config.state"

  printf '%s\n' "Storing driver configuration state..."
  _build_driver_config > "${state_path}"
  printf '%s\n' "Driver configuration stored at ${state_path}"
}

# Park the script until a termination signal arrives, then shut down.
# This function does not return to its caller: it either loops forever or
# leaves through the `exit 0` inside the signal trap.
_wait_for_signal() {
echo "Done, now waiting for signal"
# Background child that gives the `wait` below something to block on.
sleep infinity &
# The trap string is double-quoted, so $! is expanded right here, while it
# still refers to the sleep started above. On HUP/INT/QUIT/PIPE/TERM the
# handler runs _shutdown and, only if that succeeds, kills the sleep and
# exits 0; if _shutdown fails the script keeps waiting.
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
# Reset the EXIT trap to its default.
# NOTE(review): presumably so an EXIT handler installed earlier does not run
# a second shutdown after the signal handler exits - confirm against caller.
trap - EXIT
# `wait` returns early when a trapped signal is delivered; the loop then
# simply waits again.
while true; do wait $! || continue; done
# Not reached: the loop above has no break.
exit 0
}

# Fast-path install: install only the user-space driver components (no
# kernel module build or load), then remount the driver rootfs, start the
# daemons, write the kernel update hook and record the configuration state.
#
# Globals (read): DRIVER_ARCH, DRIVER_VERSION, ACCEPT_LICENSE, KERNEL_TYPE
# Exits 1 when /drivers or the extracted package directory cannot be entered,
# when the .run package cannot be extracted, when nvidia-installer fails, or
# when the kernel module type cannot be resolved.
_userspace_only_install() {
  local pkg_dir="NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"
  local src_dir="/usr/src/nvidia-${DRIVER_VERSION}"
  # Array (as in _install_driver) so every option stays a single word.
  local -a install_args=(
    --silent
    --no-kernel-module
    --no-nouveau-check
    --no-nvidia-modprobe
    --no-drm
    --no-peermem
    --ui=none
  )

  echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
  _unmount_rootfs

  # Every later step uses relative paths, so stop if a cd fails instead of
  # running the installer from an unexpected directory.
  cd /drivers || exit 1
  if [ ! -d "${pkg_dir}" ]; then
    sh "${pkg_dir}.run" -x || exit 1
  fi
  cd "${pkg_dir}" || exit 1

  echo "Installing userspace components (libraries and binaries)..."
  if [ "${ACCEPT_LICENSE:-}" = "yes" ]; then
    install_args+=("--accept-license")
  fi
  IGNORE_CC_MISMATCH=1 ./nvidia-installer "${install_args[@]}" || exit 1

  # Copy kernel module sources if not already present (needed for sidecar containers)
  if [ ! -d "${src_dir}" ]; then
    _resolve_kernel_type || exit 1
    mkdir -p "${src_dir}"
    cp -r LICENSE mkprecompiled "${KERNEL_TYPE}" "${src_dir}/"
    # Keep the first 8 manifest lines, then only kernel*/LICENSE* entries.
    sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > "${src_dir}/.manifest"
  fi

  _mount_rootfs
  _start_daemons
  _write_kernel_update_hook
  _store_driver_config
  echo "Userspace-only install complete"
}

_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
Expand Down Expand Up @@ -758,17 +819,48 @@ _load() {
_load_driver
_mount_rootfs
_write_kernel_update_hook

echo "Done, now waiting for signal"
sleep infinity &
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
trap - EXIT
while true; do wait $! || continue; done
exit 0
_store_driver_config
_wait_for_signal
}

init() {
_prepare_exclusive
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
fi

echo -e "\n========== NVIDIA Software Installer ==========\n"
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

exec 3> ${PID_FILE}
if ! flock -n 3; then
echo "An instance of the NVIDIA driver is already running, aborting"
exit 1
fi
echo $$ >&3

trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
trap "_shutdown" EXIT

if _should_use_fast_path; then
_userspace_only_install
_wait_for_signal
fi

_unload_driver || exit 1
_unmount_rootfs

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
sh /tmp/install.sh nvinstall

# Determine the kernel module type
_resolve_kernel_type || exit 1

# Copy the kernel module sources
mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest

_build

Expand Down
20 changes: 20 additions & 0 deletions rhel9/ocp_dtk_entrypoint
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ nv-ctr-run-with-dtk() {
exec bash -x nvidia-driver init
fi

if _should_use_fast_path; then
echo "Fast path detected: skipping DTK build and module copy, proceeding with userspace-only install"
exec bash -x nvidia-driver init
fi

echo "Fast path not detected: building driver and modules"

if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
cp -r \
/tmp/install.sh \
Expand Down Expand Up @@ -80,6 +87,19 @@ dtk-build-driver() {
sleep inf
fi

# Check if fast path is being used - if so, skip building and signal completion
if _should_use_fast_path; then
echo "Fast path detected in DTK container: driver already loaded with matching config, skipping build"
echo "Signaling driver_built and sleeping forever..."
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_built"
while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
sleep 30
done
echo "WARNING: driver_built flag disappeared"
exit 0
fi

if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
echo "WARNING: broken Driver Toolkit image detected:"
echo "- Node kernel: $(uname -r)"
Expand Down
Loading