Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions rhel10/precompiled/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,19 @@ RUN --mount=type=secret,id=RHSM_ORG,target=/run/secrets/RHSM_ORG \
# Install the fabric-manager related packages for non-vGPU, non-arm64 builds.
# Package names depend on the driver branch (the major component of
# DRIVER_VERSION). Every inner 'fi' is terminated with ';' because the line
# continuations join this whole section into a single shell command line, and
# 'fi if ...' without a separator is a syntax error.
&& if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGETARCH" != "arm64" ]; then \
    versionArray=(${DRIVER_VERSION//./ }); \
    DRIVER_BRANCH=${versionArray[0]}; \
    if [ "$DRIVER_BRANCH" -ge "580" ]; then \
        dnf install -y nvidia-fabricmanager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_VERSION}; \
    else \
        dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
    fi; \
    if [ "$DRIVER_BRANCH" -ge "580" ]; then \
        dnf install -y nvidia-imex-${DRIVER_VERSION} libnvsdm-${DRIVER_VERSION}; \
    elif [ "$DRIVER_BRANCH" -ge "570" ]; then \
        dnf install -y nvidia-imex-${DRIVER_BRANCH}-${DRIVER_VERSION} libnvsdm-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
    fi; \
    if [ "$DRIVER_BRANCH" -ge "550" ]; then \
        dnf install -y infiniband-diags nvlsm ; \
    fi; \
fi \
&& dnf clean all \
&& subscription-manager unregister ; \
Expand Down
42 changes: 41 additions & 1 deletion rhel10/precompiled/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,31 @@ _assert_nvswitch_system() {
return 0
}

# Report whether this host looks like an NVLink5+ system.
#
# Scans the 'vpd' file of every /sys/class/infiniband/*/device entry for the
# 'SW_MNG' marker. On the first match it prints a detection message to stdout
# and returns 0; if no VPD file contains the marker it returns 1.
# The body runs in a subshell, so the loop variables never reach the caller.
_assert_nvlink5_system() (
    for ib_device in /sys/class/infiniband/*/device; do
        vpd_path="$ib_device/vpd"

        # Skip devices that expose no VPD file (this also covers an unmatched glob).
        [ -f "$vpd_path" ] || continue

        # Skip devices whose VPD lacks the marker.
        grep -q "SW_MNG" "$vpd_path" || continue

        echo "Detected NVLink5+ system"
        return 0
    done
    return 1
)

# Block until both the mlx5_core and ib_umad kernel modules show up in the
# 'lsmod' output, polling every 10 seconds and logging a message on each
# unsuccessful poll. There is no timeout: the function only returns once both
# modules are listed. The body runs in a subshell.
_ensure_nvlink5_prerequisites() (
    while true; do
        if lsmod | grep mlx5_core > /dev/null 2>&1; then
            # Only look for ib_umad once mlx5_core has been seen.
            if lsmod | grep ib_umad > /dev/null 2>&1; then
                break
            fi
        fi
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done
)

# For each kernel module configuration file mounted into the container,
# parse the file contents and extract the custom module parameters that
# are to be passed as input to 'modprobe'.
Expand Down Expand Up @@ -224,7 +249,22 @@ _load_driver() {
_start_vgpu_topology_daemon
fi

if _assert_nvswitch_system; then
if _assert_nvlink5_system; then
_ensure_nvlink5_prerequisites || return 1
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
--fm-config-file $fm_config_file \
--fm-pid-file $fm_pid_file \
--nvlsm-config-file $nvlsm_config_file \
--nvlsm-pid-file $nvlsm_pid_file

# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
elif _assert_nvswitch_system; then
echo "Starting NVIDIA fabric manager daemon..."
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
fi
Expand Down
12 changes: 12 additions & 0 deletions rhel9/precompiled/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,19 @@ RUN --mount=type=secret,id=RHSM_ORG,target=/run/secrets/RHSM_ORG \
# Install the fabric-manager related packages for non-vGPU, non-arm64 builds.
# Package names depend on the driver branch (the major component of
# DRIVER_VERSION). Every inner 'fi' is terminated with ';' because the line
# continuations join this whole section into a single shell command line, and
# 'fi if ...' without a separator is a syntax error.
&& if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGETARCH" != "arm64" ]; then \
    versionArray=(${DRIVER_VERSION//./ }); \
    DRIVER_BRANCH=${versionArray[0]}; \
    if [ "$DRIVER_BRANCH" -ge "580" ]; then \
        dnf install -y nvidia-fabricmanager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_VERSION}; \
    else \
        dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
    fi; \
    if [ "$DRIVER_BRANCH" -ge "580" ]; then \
        dnf install -y nvidia-imex-${DRIVER_VERSION} libnvsdm-${DRIVER_VERSION}; \
    elif [ "$DRIVER_BRANCH" -ge "570" ]; then \
        dnf install -y nvidia-imex-${DRIVER_BRANCH}-${DRIVER_VERSION} libnvsdm-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \
    fi; \
    if [ "$DRIVER_BRANCH" -ge "550" ]; then \
        dnf install -y infiniband-diags nvlsm ; \
    fi; \
fi \
&& dnf clean all \
&& subscription-manager unregister ; \
Expand Down
42 changes: 41 additions & 1 deletion rhel9/precompiled/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,31 @@ _assert_nvswitch_system() {
return 0
}

# Report whether this host looks like an NVLink5+ system.
#
# Scans the 'vpd' file of every /sys/class/infiniband/*/device entry for the
# 'SW_MNG' marker. On the first match it prints a detection message to stdout
# and returns 0; if no VPD file contains the marker it returns 1.
# The body runs in a subshell, so the loop variables never reach the caller.
_assert_nvlink5_system() (
    for ib_device in /sys/class/infiniband/*/device; do
        vpd_path="$ib_device/vpd"

        # Skip devices that expose no VPD file (this also covers an unmatched glob).
        [ -f "$vpd_path" ] || continue

        # Skip devices whose VPD lacks the marker.
        grep -q "SW_MNG" "$vpd_path" || continue

        echo "Detected NVLink5+ system"
        return 0
    done
    return 1
)

# Block until both the mlx5_core and ib_umad kernel modules show up in the
# 'lsmod' output, polling every 10 seconds and logging a message on each
# unsuccessful poll. There is no timeout: the function only returns once both
# modules are listed. The body runs in a subshell.
_ensure_nvlink5_prerequisites() (
    while true; do
        if lsmod | grep mlx5_core > /dev/null 2>&1; then
            # Only look for ib_umad once mlx5_core has been seen.
            if lsmod | grep ib_umad > /dev/null 2>&1; then
                break
            fi
        fi
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done
)

# For each kernel module configuration file mounted into the container,
# parse the file contents and extract the custom module parameters that
# are to be passed as input to 'modprobe'.
Expand Down Expand Up @@ -250,7 +275,22 @@ _load_driver() {
_start_vgpu_topology_daemon
fi

if _assert_nvswitch_system; then
if _assert_nvlink5_system; then
_ensure_nvlink5_prerequisites || return 1
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
/usr/bin/nvidia-fabricmanager-start.sh --mode start \
--fm-config-file $fm_config_file \
--fm-pid-file $fm_pid_file \
--nvlsm-config-file $nvlsm_config_file \
--nvlsm-pid-file $nvlsm_pid_file

# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
elif _assert_nvswitch_system; then
echo "Starting NVIDIA fabric manager daemon..."
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
fi
Expand Down
Loading