From a08b959d74d995b26e2d50bb6c082b80449e7006 Mon Sep 17 00:00:00 2001 From: Shiva Kumar Date: Thu, 14 May 2026 18:24:23 +0530 Subject: [PATCH 1/3] Precompiled: Ubuntu26.04 driver container support Signed-off-by: Shiva Kumar (SW-CLOUD) --- .common-ci.yml | 19 + .github/workflows/precompiled.yaml | 95 +++-- .gitlab-ci.yml | 9 + .nvidia-ci.yml | 47 +++ Makefile | 26 +- base/Dockerfile | 37 ++ multi-arch.mk | 1 + ubuntu26.04/precompiled/Dockerfile | 62 +++ ubuntu26.04/precompiled/local-repo.sh | 118 ++++++ ubuntu26.04/precompiled/nvidia-driver | 566 ++++++++++++++++++++++++++ 10 files changed, 950 insertions(+), 30 deletions(-) create mode 100644 ubuntu26.04/precompiled/Dockerfile create mode 100755 ubuntu26.04/precompiled/local-repo.sh create mode 100755 ubuntu26.04/precompiled/nvidia-driver diff --git a/.common-ci.yml b/.common-ci.yml index 8db2bcc8a..1c11558c1 100644 --- a/.common-ci.yml +++ b/.common-ci.yml @@ -110,6 +110,14 @@ trigger-pipeline: KERNEL_FLAVOR: [aws, azure, azure-fde, generic, nvidia, oracle] LTS_KERNEL: ["6.8"] +# Define the matrix of precompiled jobs that can be run in parallel for ubuntu26.04 +.driver-versions-precompiled-ubuntu26.04: + parallel: + matrix: + - DRIVER_BRANCH: [595] + KERNEL_FLAVOR: [aws, azure, azure-fde, generic, nvidia, oracle] + LTS_KERNEL: ["7.0"] + .dist-ubuntu22.04: variables: DIST: ubuntu22.04 @@ -467,3 +475,14 @@ release:staging-precompiled-ubuntu24.04: - .release:staging-precompiled needs: - image-precompiled-ubuntu24.04 + +# Precompiled Ubuntu26.04 release +release:staging-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: resolute + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .release:staging-precompiled + needs: + - image-precompiled-ubuntu26.04 diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index 5b77aafa7..c58f0c50e 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -29,6 +29,7 @@ jobs: runs-on:
linux-amd64-cpu4 outputs: driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }} + exclude_build_matrix_pairs: ${{ steps.extract_driver_branch.outputs.exclude_build_matrix_pairs }} kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }} dist: ${{ steps.extract_driver_branch.outputs.dist }} lts_kernel: ${{ steps.extract_driver_branch.outputs.lts_kernel }} @@ -39,7 +40,7 @@ jobs: id: extract_driver_branch run: | # get driver_branch - DRIVER_BRANCH=("535" "580") + DRIVER_BRANCH=("535" "580" "595") driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .) echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT @@ -49,15 +50,18 @@ jobs: echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT # get ubuntu distributions - DIST=("ubuntu22.04" "ubuntu24.04") + DIST=("ubuntu22.04" "ubuntu24.04" "ubuntu26.04") dist_json=$(printf '%s\n' "${DIST[@]}" | jq -R . | jq -cs .) echo "dist=$dist_json" >> $GITHUB_OUTPUT # LTS_KERNEL setup - LTS_KERNEL=("5.15" "6.8") + LTS_KERNEL=("5.15" "6.8" "7.0") lts_kernel_json=$(printf '%s\n' "${LTS_KERNEL[@]}" | jq -R . | jq -cs .) echo "lts_kernel=$lts_kernel_json" >> $GITHUB_OUTPUT + EXCLUDE_BUILD_MATRIX_PAIRS=("ubuntu22.04 595" "ubuntu24.04 535" "ubuntu26.04 535" "ubuntu26.04 580") + echo "exclude_build_matrix_pairs=$(printf '%s\n' "${EXCLUDE_BUILD_MATRIX_PAIRS[@]}" | jq -R . 
| jq -cs .)" >> $GITHUB_OUTPUT + precompiled-build-image: needs: set-driver-version-matrix runs-on: linux-amd64-cpu4 @@ -70,10 +74,25 @@ jobs: exclude: - dist: ubuntu24.04 driver_branch: 535 + - dist: ubuntu26.04 + driver_branch: 535 + - dist: ubuntu26.04 + driver_branch: 580 + - dist: ubuntu22.04 + driver_branch: 595 - lts_kernel: 5.15 dist: ubuntu24.04 + - lts_kernel: 7.0 + dist: ubuntu24.04 + - lts_kernel: 5.15 + dist: ubuntu26.04 + - lts_kernel: 6.8 + dist: ubuntu26.04 + - lts_kernel: 7.0 + dist: ubuntu22.04 - flavor: azure-fde dist: ubuntu22.04 + max-parallel: 5 steps: - uses: actions/checkout@v6 name: Check out code @@ -113,6 +132,8 @@ jobs: BASE_TARGET="jammy" elif [[ "${{ matrix.dist }}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" + elif [[ "${{ matrix.dist }}" == "ubuntu26.04" ]]; then + BASE_TARGET="resolute" fi make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} LTS_KERNEL=${LTS_KERNEL} build-base-${BASE_TARGET} @@ -143,6 +164,8 @@ jobs: BASE_TARGET="jammy" elif [[ "${{ matrix.dist }}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" + elif [[ "${{ matrix.dist }}" == "ubuntu26.04" ]]; then + BASE_TARGET="resolute" fi tar -cvf kernel-version-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar kernel_version.txt docker save "${PRIVATE_REGISTRY}/nvidia/driver:base-${BASE_TARGET}-${LTS_KERNEL}-${{ matrix.flavor }}-${{ matrix.driver_branch }}" \ @@ -183,6 +206,14 @@ jobs: exclude: - lts_kernel: 5.15 dist: ubuntu24.04 + - lts_kernel: 7.0 + dist: ubuntu24.04 + - lts_kernel: 5.15 + dist: ubuntu26.04 + - lts_kernel: 6.8 + dist: ubuntu26.04 + - lts_kernel: 7.0 + dist: ubuntu22.04 needs: - precompiled-build-image - set-driver-version-matrix @@ -212,14 +243,14 @@ jobs: kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}' KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]')) driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}' - DRIVER_BRANCHES=($(echo 
"$driver_branch_json" | jq -r '.[]')) - - # remove 535 driver branch for ubuntu24.04 - if [ "$DIST" == "ubuntu24.04" ]; then - DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do - [[ $branch != "535" ]] && echo "$branch" - done)) - fi + exclude_pairs_json='${{ needs.set-driver-version-matrix.outputs.exclude_build_matrix_pairs }}' + DRIVER_BRANCHES=() + for b in $(echo "$driver_branch_json" | jq -r '.[]'); do + pair="$DIST $b" + if ! echo "$exclude_pairs_json" | jq -r '.[]' | grep -qx "$pair"; then + DRIVER_BRANCHES+=("$b") + fi + done source ./tests/scripts/ci-precompiled-helpers.sh KERNEL_VERSIONS=($(get_kernel_versions_to_test KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST $LTS_KERNEL)) if [ -z "$KERNEL_VERSIONS" ]; then @@ -358,15 +389,17 @@ jobs: echo "DIST=$DIST" >> $GITHUB_ENV KERNEL_VERSION=${KERNEL_VERSION%-*} echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV - driver_branch_json="${{ needs.set-driver-version-matrix.outputs.driver_branch }}" - DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]')) - echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV + printf 'DRIVER_BRANCHES_JSON=%s\n' '${{ needs.set-driver-version-matrix.outputs.driver_branch }}' >> $GITHUB_ENV + printf 'EXCLUDE_PAIRS_JSON=%s\n' '${{ needs.set-driver-version-matrix.outputs.exclude_build_matrix_pairs }}' >> $GITHUB_ENV - name: Configure Holodeck e2e test config (kernel, instance) run: | yq eval '.spec += {"kernel": {"version": strenv(KERNEL_VERSION)}}' -i tests/holodeck_ubuntu.yaml if [[ "$DIST" == "ubuntu24.04" ]]; then yq eval '.spec.instance.os = "ubuntu-24.04"' -i tests/holodeck_ubuntu.yaml fi + if [[ "$DIST" == "ubuntu26.04" ]]; then + yq eval '.spec.instance.os = "ubuntu-26.04"' -i tests/holodeck_ubuntu.yaml + fi - name: Set up Holodeck uses: NVIDIA/holodeck@v0.3.2 @@ -401,15 +434,13 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | rc=0 - # for precompiled driver we are setting driver branch as driver version - DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }}) 
- # remove 535 driver branch for ubuntu24.04 - if [ "$DIST" == "ubuntu24.04" ]; then - DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do - [[ $branch != "535" ]] && echo "$branch" - done)) - fi + DRIVER_BRANCHES=($(echo '${{ env.DRIVER_BRANCHES_JSON }}' | jq -r '.[]')) + exclude_pairs='${{ env.EXCLUDE_PAIRS_JSON }}' for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do + pair="$DIST $DRIVER_VERSION" + if echo "$exclude_pairs" | jq -r '.[]' | grep -qx "$pair"; then + continue + fi echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION" status=0 TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}" @@ -455,18 +486,26 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Set image vars + id: set_image_vars run: | echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV + KERNEL_VERSION="${{ matrix.kernel_version }}" + DIST="${KERNEL_VERSION##*-}" + pair="$DIST ${{ matrix.driver_branch }}" + echo "run_publish=true" >> $GITHUB_OUTPUT + if echo '${{ needs.set-driver-version-matrix.outputs.exclude_build_matrix_pairs }}' | jq -r '.[]' | grep -qx "$pair"; then + echo "run_publish=false" >> $GITHUB_OUTPUT + fi - name: Download base image artifact - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' uses: actions/download-artifact@v8 with: name: base-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }} path: ./ - name: Publish base image - if: ${{ ! 
(matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' run: | LTS_KERNEL=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^([0-9]+\.[0-9]+)\..*/\1/') KERNEL_FLAVOR=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^[0-9]+\.[0-9]+\.[0-9]+-[0-9]+-(.*)-ubuntu[0-9]+\.[0-9]+$/\1/') @@ -475,6 +514,8 @@ jobs: BASE_TARGET="jammy" elif [[ "${DIST}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" + elif [[ "${DIST}" == "ubuntu26.04" ]]; then + BASE_TARGET="resolute" fi image_path="./base-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar" echo "uploading $image_path" @@ -486,14 +527,14 @@ jobs: fi - name: Download built image artifact - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' uses: actions/download-artifact@v8 with: name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }} path: ./ - name: Publish image - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' run: | image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar" echo "uploading $image_path" @@ -505,7 +546,7 @@ jobs: fi - name: Slack notification - if: ${{ ! 
(matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) && github.ref == 'refs/heads/main' }} + if: ${{ steps.set_image_vars.outputs.run_publish == 'true' && github.ref == 'refs/heads/main' }} uses: slackapi/slack-github-action@v3.0.3 with: token: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 15db285a4..01a368592 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -137,3 +137,12 @@ image-precompiled-ubuntu24.04: extends: - .driver-versions-precompiled-ubuntu24.04 - .image-build-precompiled + +image-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: resolute + CVE_UPDATES: "curl libc6" + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .image-build-precompiled diff --git a/.nvidia-ci.yml b/.nvidia-ci.yml index 9cd2ff68e..7baeeefe9 100644 --- a/.nvidia-ci.yml +++ b/.nvidia-ci.yml @@ -112,6 +112,20 @@ image-precompiled-ubuntu24.04: - .driver-versions-precompiled-ubuntu24.04 - .image-pull-generic +image-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: resolute + PRECOMPILED: "true" + CVE_UPDATES: "curl libc6" + rules: - if: $CI_PIPELINE_SOURCE == "schedule" + when: delayed + start_in: 30 minutes + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .image-pull-generic + .image-pull-ubuntu22.04: # Perform for each DRIVER_VERSION extends: @@ -281,6 +295,18 @@ image-rocky10: - !reference [.scan-rules-common, rules] - !reference [.precompiled-rules, rules] +.scan-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: resolute + PRECOMPILED: "true" + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .scan-generic + rules: + - !reference [.scan-rules-common, rules] + - !reference [.precompiled-rules, rules] + .scan-precompiled-ubuntu22.04: variables: DIST: signed_ubuntu22.04 @@ -334,6 +360,15 @@ scan-precompiled-ubuntu24.04-amd64: needs: - image-precompiled-ubuntu24.04 +scan-precompiled-ubuntu26.04-amd64: +
variables: + PLATFORM: linux/amd64 + extends: + - .scan-precompiled-ubuntu26.04 + - .platform-amd64 + needs: + - image-precompiled-ubuntu26.04 + scan-precompiled-ubuntu22.04: variables: PLATFORM: linux/amd64 @@ -476,6 +511,18 @@ release:ngc-precompiled-ubuntu24.04: rules: - !reference [.precompiled-rules, rules] +release:ngc-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: resolute + PRECOMPILED: "true" + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .release-generic + - .release:ngc-variables + rules: + - !reference [.precompiled-rules, rules] + release:ngc-precompiled-ubuntu22.04: variables: DIST: signed_ubuntu22.04 diff --git a/Makefile b/Makefile index b74bd4b5a..4859bbc1a 100644 --- a/Makefile +++ b/Makefile @@ -54,10 +54,10 @@ OUT_IMAGE_TAG = $(OUT_IMAGE_VERSION)-$(OUT_DIST) OUT_IMAGE = $(OUT_IMAGE_NAME):$(OUT_IMAGE_TAG) ##### Public rules ##### -DISTRIBUTIONS := ubuntu22.04 ubuntu24.04 signed_ubuntu22.04 signed_ubuntu24.04 rhel8 rhel9 rhel10 rocky8 rocky9 rocky10 precompiled_rhcos +DISTRIBUTIONS := ubuntu22.04 ubuntu24.04 signed_ubuntu22.04 signed_ubuntu24.04 signed_ubuntu26.04 rhel8 rhel9 rhel10 rocky8 rocky9 rocky10 precompiled_rhcos RHCOS_VERSIONS := rhcos4.14 rhcos4.15 rhcos4.16 rhcos4.17 rhcos4.18 rhel9.6 PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) -BASE_FROM := noble jammy +BASE_FROM := resolute noble jammy PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) VGPU_GUEST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuguest-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) VGPU_HOST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuhost-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) @@ -98,6 +98,10 @@ pull-signed_ubuntu24.04%: DIST = ubuntu24.04 pull-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) pull-signed_ubuntu24.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +pull-signed_ubuntu26.04%: DIST = ubuntu26.04 +pull-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +pull-signed_ubuntu26.04%:
IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + PLATFORM ?= linux/amd64 $(DRIVER_PULL_TARGETS): pull-%: $(DOCKER) pull "--platform=$(PLATFORM)" "$(IMAGE)" @@ -116,6 +120,10 @@ archive-signed_ubuntu24.04%: DIST = ubuntu24.04 archive-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) archive-signed_ubuntu24.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +archive-signed_ubuntu26.04%: DIST = ubuntu26.04 +archive-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +archive-signed_ubuntu26.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + $(DRIVER_ARCHIVE_TARGETS): archive-%: $(DOCKER) save "$(IMAGE)" -o "archive.tar" @@ -139,6 +147,11 @@ push-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) push-signed_ubuntu24.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) push-signed_ubuntu24.04%: OUT_IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +push-signed_ubuntu26.04%: DIST = ubuntu26.04 +push-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +push-signed_ubuntu26.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +push-signed_ubuntu26.04%: OUT_IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + # $(DRIVER_BUILD_TARGETS) is in the form of build-$(DIST)-$(DRIVER_VERSION) # Parse the target to set the required variables. build-%: DIST = $(word 2,$(subst -, ,$@)) @@ -191,6 +204,14 @@ build-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) build-signed_ubuntu24.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) build-signed_ubuntu24.04%: DOCKER_BUILD_ARGS = --build-arg KERNEL_VERSION="$(KERNEL_VERSION)" +# ubuntu26.04 Precompiled Driver +build-signed_ubuntu26.04%: DIST = ubuntu26.04 +build-signed_ubuntu26.04%: SUBDIR = . 
+build-signed_ubuntu26.04%: DOCKERFILE = $(CURDIR)/ubuntu26.04/precompiled/Dockerfile +build-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +build-signed_ubuntu26.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +build-signed_ubuntu26.04%: DOCKER_BUILD_ARGS = --build-arg KERNEL_VERSION="$(KERNEL_VERSION)" + # base is an image used to poll Canonical for the latest kernel version # LTS_KERNEL must be defined in the environment when invoking this target. LTS_KERNEL ?= "" @@ -298,4 +319,3 @@ $(VGPU_HOST_DRIVER_BUILD_TARGETS): push-vgpuhost-%: $(if $(VGPU_HOST_DRIVER_VERSION),,$(error "VGPU_HOST_DRIVER_VERSION is not set")) push-vgpuhost-%: DRIVER_TAG = $(VGPU_HOST_DRIVER_VERSION) push-vgpuhost-%: DIST = $(word 3,$(subst -, ,$@)) - diff --git a/base/Dockerfile b/base/Dockerfile index bcfdec39a..6e5956fd2 100644 --- a/base/Dockerfile +++ b/base/Dockerfile @@ -1,3 +1,40 @@ +# Ubuntu 26.04 +FROM ubuntu:resolute-20260421 AS resolute + +SHELL ["/bin/bash", "-c"] + +ARG DRIVER_BRANCH +ARG KERNEL_FLAVOR +ARG LTS_KERNEL +ENV DRIVER_BRANCH=${DRIVER_BRANCH} +ENV KERNEL_FLAVOR=${KERNEL_FLAVOR} +ENV LTS_KERNEL=${LTS_KERNEL} + +RUN rm -f /etc/apt/sources.list.d/cuda* && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ resolute main restricted universe" > /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ resolute-updates main restricted universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ resolute-security main restricted universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://us.archive.ubuntu.com/ubuntu resolute-updates main restricted" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://us.archive.ubuntu.com/ubuntu resolute-security main restricted" >> /etc/apt/sources.list && \ + rm -f /etc/apt/sources.list.d/ubuntu.sources + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +ENV 
NVIDIA_VISIBLE_DEVICES=void + +RUN apt-get update && apt-get install -y --no-install-recommends \ + apt-utils git curl && \ + rm -rf /var/lib/apt/lists/* + +RUN usermod -o -u 0 -g 0 _apt + +COPY generate-ci-config /usr/local/bin/generate-ci-config + +RUN chmod +x /usr/local/bin/generate-ci-config && \ + generate-ci-config + +ENTRYPOINT ["/usr/bin/sleep","1000"] + # Ubuntu 24.04 FROM ubuntu:noble-20260410 AS noble diff --git a/multi-arch.mk b/multi-arch.mk index d5201eb70..c0f1e5cea 100644 --- a/multi-arch.mk +++ b/multi-arch.mk @@ -26,3 +26,4 @@ $(DRIVER_PUSH_TARGETS): push-%: # No multi-arch support for the following distributions build-signed_ubuntu22.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-signed_ubuntu24.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 +build-signed_ubuntu26.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 diff --git a/ubuntu26.04/precompiled/Dockerfile b/ubuntu26.04/precompiled/Dockerfile new file mode 100644 index 000000000..2bae19566 --- /dev/null +++ b/ubuntu26.04/precompiled/Dockerfile @@ -0,0 +1,62 @@ +ARG BASE_IMAGE=ubuntu:resolute-20260421 +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG BASE_URL=https://us.download.nvidia.com/tesla +ARG TARGETARCH +ENV TARGETARCH=$TARGETARCH +ARG DRIVER_BRANCH=580 +ENV DRIVER_BRANCH=$DRIVER_BRANCH +ARG DRIVER_VERSION=580.126.20 +ENV DRIVER_VERSION=$DRIVER_VERSION + +ARG KERNEL_VERSION=7.0.0-12-generic +ENV KERNEL_VERSION=$KERNEL_VERSION + +ENV NVIDIA_VISIBLE_DEVICES=void + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN dpkg --add-architecture i386 && \ + apt-get update && apt-get install -y --no-install-recommends \ + apt-utils \ + build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* + +# Fetch GPG keys for CUDA repo +RUN rm -f /etc/apt/sources.list.d/cuda* && \ + curl -fsSL 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2604/x86_64/cuda-keyring_1.1-1_all.deb -o cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + rm -f cuda-keyring_1.1-1_all.deb + +RUN usermod -o -u 0 -g 0 _apt + +# Install / upgrade packages here that are required to resolve CVEs +ARG CVE_UPDATES +RUN if [ -n "${CVE_UPDATES}" ]; then \ + apt-get update && apt-get --only-upgrade -y install ${CVE_UPDATES} && \ + rm -rf /var/lib/apt/lists/*; \ + fi + +COPY ubuntu26.04/precompiled/nvidia-driver /usr/local/bin + +ADD ubuntu26.04/precompiled/local-repo.sh /tmp + +RUN mkdir -p /usr/local/repos && \ + /tmp/local-repo.sh download_driver_package_deps && \ + /tmp/local-repo.sh build_local_apt_repo && \ + /tmp/local-repo.sh fetch_nvidia_installer && \ + # Remove all other ubuntu apt sources to ensure we only pull from the local apt repo + rm /etc/apt/sources.list.d/* + +WORKDIR /drivers + +ENTRYPOINT ["nvidia-driver", "init"] diff --git a/ubuntu26.04/precompiled/local-repo.sh b/ubuntu26.04/precompiled/local-repo.sh new file mode 100755 index 000000000..97b57f019 --- /dev/null +++ b/ubuntu26.04/precompiled/local-repo.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +set -eu + +LOCAL_REPO_DIR=/usr/local/repos +DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} +DRIVER_RUN_FILE=NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION + +download_apt_with_dep () { + local package_name="$1" + local package_version + if [ $# -gt 1 ] && [ -n "$2" ]; then + package_version="$2" + apt-get download "${package_name}=${package_version}" + else + apt-get download "${package_name}" + fi + + dependent_pkgs=$(apt-cache depends --recurse --no-recommends --no-suggests --no-conflicts --no-breaks --no-replaces --no-enhances "$package_name" | grep "^\w" | grep -v "$package_name" | sort -u) + if [ -n "$dependent_pkgs" ]; then + apt-get download $dependent_pkgs + fi +} + +nvlink5_pkgs_download() { + if [ "$DRIVER_BRANCH" -ge "570" ]; then + 
download_apt_with_dep nvlsm + download_apt_with_dep infiniband-diags + fi +} + +nvsdm_download() { + if [ "$TARGETARCH" = "amd64" ]; then + if [ "$DRIVER_BRANCH" -ge "595" ]; then + download_apt_with_dep libnvsdm ${DRIVER_VERSION}* + elif [ "$DRIVER_BRANCH" -ge "560" ] && [ "$DRIVER_BRANCH" -lt "580" ]; then + download_apt_with_dep libnvsdm-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi + fi +} + +fabricmanager_download() { + if [ "$DRIVER_BRANCH" -ge "595" ]; then + download_apt_with_dep nvidia-fabricmanager ${DRIVER_VERSION}* + else + download_apt_with_dep nvidia-fabricmanager-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi +} + +nscq_download() { + if [ "$DRIVER_BRANCH" -ge "595" ]; then + download_apt_with_dep libnvidia-nscq ${DRIVER_VERSION}* + else + download_apt_with_dep libnvidia-nscq-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi +} + +imex_download() { + if [ "$DRIVER_BRANCH" -ge "595" ]; then + download_apt_with_dep nvidia-imex ${DRIVER_VERSION}* + elif [ "$DRIVER_BRANCH" -ge "550" ]; then + download_apt_with_dep nvidia-imex-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi +} + +download_driver_package_deps () { + apt-get update + pushd ${LOCAL_REPO_DIR} + + download_apt_with_dep linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + download_apt_with_dep linux-signatures-nvidia-${KERNEL_VERSION} + download_apt_with_dep linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + download_apt_with_dep linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + download_apt_with_dep nvidia-utils-${DRIVER_BRANCH}-server + download_apt_with_dep nvidia-headless-no-dkms-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-decode-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-extra-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-encode-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-fbc1-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-gl-${DRIVER_BRANCH}-server + + fabricmanager_download + nscq_download + 
nvlink5_pkgs_download + imex_download + nvsdm_download + + ls -al . + popd +} + +build_local_apt_repo () { + pushd ${LOCAL_REPO_DIR} + dpkg-scanpackages . /dev/null | gzip -9c | tee Packages.gz > /dev/null + echo "deb [trusted=yes] file:${LOCAL_REPO_DIR} ./" > /etc/apt/sources.list + popd + apt-get update +} + +fetch_nvidia_installer () { + curl -fSsl -O $BASE_URL/$DRIVER_VERSION/$DRIVER_RUN_FILE.run + chmod +x $DRIVER_RUN_FILE.run + sh $DRIVER_RUN_FILE.run -x + mv $DRIVER_RUN_FILE/nvidia-installer /usr/bin/ + rm -rf $DRIVER_RUN_FILE + rm $DRIVER_RUN_FILE.run +} + +if [ "$1" = "download_driver_package_deps" ]; then + download_driver_package_deps +elif [ "$1" = "build_local_apt_repo" ]; then + build_local_apt_repo +elif [ "$1" = "fetch_nvidia_installer" ]; then + fetch_nvidia_installer +else + echo "Unknown function: $1" + exit 1 +fi diff --git a/ubuntu26.04/precompiled/nvidia-driver b/ubuntu26.04/precompiled/nvidia-driver new file mode 100755 index 000000000..a4ea7266d --- /dev/null +++ b/ubuntu26.04/precompiled/nvidia-driver @@ -0,0 +1,566 @@ +#! /bin/bash +# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. 
+ +set -eu + +KERNEL_VERSION=$(uname -r) +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing driver version"} +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +NVIDIA_MODULE_PARAMS=() +NVIDIA_UVM_MODULE_PARAMS=() +NVIDIA_MODESET_MODULE_PARAMS=() +NVIDIA_PEERMEM_MODULE_PARAMS=() +TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} +MODPROBE_CONFIG_DIR="/etc/modprobe.d" + + +fabricmanager_install() { + local fabricmanager_package_name + if [ "$DRIVER_BRANCH" -ge "595" ]; then + fabricmanager_package_name=nvidia-fabricmanager + else + fabricmanager_package_name=nvidia-fabricmanager-${DRIVER_BRANCH} + fi + apt-get install -y --no-install-recommends ${fabricmanager_package_name}=${DRIVER_VERSION}* + apt-mark hold ${fabricmanager_package_name} +} + +nscq_install() { + local nscq_package_name + if [ "$DRIVER_BRANCH" -ge "595" ]; then + nscq_package_name=libnvidia-nscq + else + nscq_package_name=libnvidia-nscq-${DRIVER_BRANCH} + fi + apt-get install -y --no-install-recommends ${nscq_package_name}=${DRIVER_VERSION}* + apt-mark hold ${nscq_package_name} +} + +imex_install() { + local imex_package_name + if [ "$DRIVER_BRANCH" -ge "595" ]; then + imex_package_name=nvidia-imex + elif [ "$DRIVER_BRANCH" -ge "550" ]; then + imex_package_name=nvidia-imex-${DRIVER_BRANCH} + else + return 0 + fi + apt-get install -y --no-install-recommends ${imex_package_name}=${DRIVER_VERSION}* + apt-mark hold ${imex_package_name} +} + +nvlink5_pkgs_install() { + if [ "$DRIVER_BRANCH" -ge "570" ]; then + apt-get install -y --no-install-recommends nvlsm + apt-get install -y --no-install-recommends infiniband-diags + fi +} + +# libnvsdm packages are not available for arm64 +nvsdm_install() { + local nvsdm_package_name + if [ "$TARGETARCH" = "amd64" ]; then + if [ "$DRIVER_BRANCH" -ge "595" ]; then + nvsdm_package_name=libnvsdm + elif [ "$DRIVER_BRANCH" -ge 
"560" ] && [ "$DRIVER_BRANCH" -lt "580" ]; then + nvsdm_package_name=libnvsdm-${DRIVER_BRANCH} + else + return 0 + fi + apt-get install -y --no-install-recommends ${nvsdm_package_name}=${DRIVER_VERSION}* + apt-mark hold ${nvsdm_package_name} + fi +} + +_update_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Updating the package cache..." + if ! apt-get -qq update; then + echo "ERROR: Failed to update package cache. "\ + "Ensure that the cluster can access the proper networks." + exit 1 + fi + fi +} + +_assert_nvswitch_system() { + [ -d /proc/driver/nvidia-nvswitch/devices ] || return 1 + if [ -z "$(ls -A /proc/driver/nvidia-nvswitch/devices)" ]; then + return 1 + fi + return 0 +} + +_assert_nvlink5_system() ( + for dir in /sys/class/infiniband/*/device; do + # Define the path to the VPD file + vpd_file="$dir/vpd" + + # Check if the VPD file exists + if [ -f "$vpd_file" ]; then + # Search for 'SW_MNG' in the VPD file + if grep -q "SW_MNG" "$vpd_file"; then + echo "Detected NVLink5+ system" + return 0 + fi + fi + done + return 1 +) + +_ensure_nvlink5_prerequisites() ( + until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1; + do + echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded" + sleep 10 + done +) + +# Check if mellanox devices are present +_mellanox_devices_present() { + devices_found=0 + for dev in /sys/bus/pci/devices/*; do + read vendor < $dev/vendor + if [ "$vendor" = "0x15b3" ]; then + echo "Mellanox device found at $(basename $dev)" + return 0 + fi + done + echo "No Mellanox devices were found..." 
+ return 1 +} + +_gpu_direct_rdma_enabled() { + if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then + # check if mellanox cards are present + if _mellanox_devices_present; then + return 0 + fi + fi + return 1 +} + +# For each kernel module configuration file mounted into the container, +# parse the file contents and extract the custom module parameters that +# are to be passed as input to 'modprobe'. +# +# Assumptions: +# - Configuration files are named .conf (i.e. nvidia.conf, nvidia-uvm.conf). +# - Configuration files are mounted inside the container at /drivers. +# - Each line in the file contains at least one parameter, where parameters on the same line +# are space delimited. It is up to the user to properly format the file to ensure +# the correct set of parameters are passed to 'modprobe'. +_get_module_params() { + local base_path="/drivers" + # nvidia + if [ -f "${base_path}/nvidia.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia.conf" + echo "Module parameters provided for nvidia: ${NVIDIA_MODULE_PARAMS[@]}" + fi + # nvidia-uvm + if [ -f "${base_path}/nvidia-uvm.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_UVM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-uvm.conf" + echo "Module parameters provided for nvidia-uvm: ${NVIDIA_UVM_MODULE_PARAMS[@]}" + fi + # nvidia-modeset + if [ -f "${base_path}/nvidia-modeset.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODESET_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-modeset.conf" + echo "Module parameters provided for nvidia-modeset: ${NVIDIA_MODESET_MODULE_PARAMS[@]}" + fi + # nvidia-peermem + if [ -f "${base_path}/nvidia-peermem.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_PEERMEM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-peermem.conf" + echo "Module parameters provided for nvidia-peermem: 
${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + fi +} + +_create_module_params_conf() { + echo "Parsing kernel module parameters..." + _get_module_params + + if [ ${#NVIDIA_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia module parameters in ${MODPROBE_CONFIG_DIR}/nvidia.conf" + echo "options nvidia ${NVIDIA_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia.conf + fi + if [ ${#NVIDIA_UVM_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia-uvm module parameters in ${MODPROBE_CONFIG_DIR}/nvidia-uvm.conf" + echo "options nvidia-uvm ${NVIDIA_UVM_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia-uvm.conf + fi + if [ ${#NVIDIA_MODESET_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia-modeset module parameters in ${MODPROBE_CONFIG_DIR}/nvidia-modeset.conf" + echo "options nvidia-modeset ${NVIDIA_MODESET_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia-modeset.conf + fi + if [ ${#NVIDIA_PEERMEM_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia-peermem module parameters in ${MODPROBE_CONFIG_DIR}/nvidia-peermem.conf" + echo "options nvidia-peermem ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia-peermem.conf + fi +} + +# Load the kernel modules and start persistenced. +_load_driver() { + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local set_fw_path="true" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + for param in "${NVIDIA_MODULE_PARAMS[@]}"; do + if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then + set_fw_path="false" + fi + done + + if [[ "$set_fw_path" == "true" ]]; then + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + if [[ ! 
-z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration" + else + echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure firmware search path" + fi + fi + + echo "Loading ipmi and i2c_core kernel modules..." + modprobe -a i2c_core ipmi_msghandler ipmi_devintf + + echo "Loading NVIDIA driver kernel modules..." + set -o xtrace +o nounset + modprobe nvidia + modprobe nvidia-uvm + modprobe nvidia-modeset + set +o xtrace -o nounset + + + if _gpu_direct_rdma_enabled; then + echo "Loading NVIDIA Peer Memory kernel module..." + set -o xtrace +o nounset + modprobe nvidia-peermem + set +o xtrace -o nounset + fi + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + DRIVER_VERSION=$(nvidia-smi -q | grep "Driver Version" | awk -F: '{print $2}' | xargs) + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + + echo "Installing NVIDIA fabric manager, libnvsdm and nvlsm packages..." + nvlink5_pkgs_install + fabricmanager_install + nvsdm_install + imex_install + + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Installing NVIDIA fabric manager and libnvidia NSCQ packages..." + fabricmanager_install + nscq_install + imex_install + + echo "Starting NVIDIA fabric manager daemon..." 
+ nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi + + return 0 +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_uvm_refs=0 + local nvidia_modeset_refs=0 + local nvidia_peermem_refs=0 + + if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then + echo "Stopping NVIDIA persistence daemon..." + local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA persistence daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then + echo "Stopping NVIDIA fabric manager daemon..." + local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA fabric manager daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then + echo "Stopping NVLink Subnet Manager daemon..." + local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVLink Subnet Manager daemon" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." 
+ if [ -f /sys/module/nvidia_drm/refcnt ]; then + nvidia_drm_refs=$(< /sys/module/nvidia_drm/refcnt) + rmmod_args+=("nvidia-drm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_modeset/refcnt ]; then + nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) + rmmod_args+=("nvidia-modeset") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_uvm/refcnt ]; then + nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) + rmmod_args+=("nvidia-uvm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt) + rmmod_args+=("nvidia-peermem") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + if [ ${nvidia_refs} -gt ${nvidia_deps} ]; then + # run lsmod to debug module usage + lsmod | grep nvidia + echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 + return 1 + fi + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod ${rmmod_args[@]} + fi + return 0 +} + +_resolve_kernel_type_from_driver_branch() { + [[ "${DRIVER_BRANCH}" -lt 560 ]] && KERNEL_TYPE=kernel || KERNEL_TYPE=kernel-open +} + +# _resolve_kernel_type determines which kernel module type, open or proprietary, to install. +# This function assumes that the nvidia-installer binary is in the PATH, so this function +# should only be invoked after the userspace driver components have been installed. +# +# KERNEL_MODULE_TYPE is the frontend interface that users can use to configure which module +# to install. Valid values for KERNEL_MODULE_TYPE are 'auto' (default), 'open', and 'proprietary'. +# When 'auto' is configured, we use the nvidia-installer to recommend the module type to install. 
+_resolve_kernel_type() {
+    if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then
+        KERNEL_TYPE=kernel
+    elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then
+        KERNEL_TYPE=kernel-open
+    elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then
+        # NOTE: the exit status of a plain variable assignment is the exit
+        # status of the command substitution, so the $? check below reflects
+        # nvidia-installer itself, not the assignment.
+        kernel_module_type=$(nvidia-installer --print-recommended-kernel-module-type 2> /dev/null)
+        if [ $? -ne 0 ]; then
+            # Fallback heuristic: _resolve_kernel_type_from_driver_branch picks
+            # 'kernel' for DRIVER_BRANCH < 560 and 'kernel-open' otherwise.
+            echo "failed to retrieve the recommended kernel module type from nvidia-installer, falling back to using the driver branch"
+            _resolve_kernel_type_from_driver_branch
+            return 0
+        fi
+        # Any recommendation other than the literal string "open" (including
+        # empty output) resolves to the proprietary module type.
+        [[ "${kernel_module_type}" == "open" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
+    else
+        # Reject anything outside {proprietary, open, auto}; callers treat a
+        # non-zero return as fatal (init does `_resolve_kernel_type || exit 1`).
+        echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
+        return 1
+    fi
+}
+
+# Link and install the kernel modules from precompiled packages.
+#
+# Installs the userspace driver components plus the Canonical-signed
+# precompiled kernel module packages for the running kernel. Requires
+# KERNEL_TYPE to have been resolved first (see _resolve_kernel_type) and
+# assumes DRIVER_BRANCH and KERNEL_VERSION are set by the caller and that
+# the package cache has already been updated -- TODO confirm against the
+# surrounding init() flow.
+_install_driver() {
+    # Install necessary driver userspace packages
+    apt-get install -y --no-install-recommends \
+        nvidia-utils-${DRIVER_BRANCH}-server \
+        nvidia-headless-no-dkms-${DRIVER_BRANCH}-server \
+        libnvidia-decode-${DRIVER_BRANCH}-server \
+        libnvidia-extra-${DRIVER_BRANCH}-server \
+        libnvidia-encode-${DRIVER_BRANCH}-server \
+        libnvidia-fbc1-${DRIVER_BRANCH}-server \
+        libnvidia-gl-${DRIVER_BRANCH}-server
+
+    # Now install the precompiled kernel module packages signed by Canonical
+    if [ "$KERNEL_TYPE" = "kernel-open" ]; then
+        echo "Installing Open NVIDIA driver kernel modules..."
+        # NOTE(review): unlike the closed branch below, no explicit
+        # linux-objects-nvidia-*-open package is installed here -- presumably
+        # the linux-modules-nvidia-*-server-open metapackage pulls it in;
+        # verify against the Ubuntu archive for the target kernel.
+        apt-get install --no-install-recommends -y \
+            linux-signatures-nvidia-${KERNEL_VERSION} \
+            linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION}
+    else
+        echo "Installing Closed NVIDIA driver kernel modules..."
+        apt-get install --no-install-recommends -y \
+            linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \
+            linux-signatures-nvidia-${KERNEL_VERSION} \
+            linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION}
+    fi
+}
+
+# Mount the driver rootfs into the run directory with the exception of sysfs.
+_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." + mount --make-runbindable /sys + mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + mount --rbind / ${RUN_DIR}/driver +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + umount -l -R ${RUN_DIR}/driver + fi +} + +init() { + # Determine the kernel module type + _resolve_kernel_type || exit 1 + + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver branch ${DRIVER_BRANCH} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + _unload_driver || exit 1 + _unmount_rootfs + + _update_package_cache + + _create_module_params_conf + _install_driver + _load_driver || exit 1 + _mount_rootfs + + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! 
|| continue; done + exit 0 +} + +_shutdown() { + if _unload_driver; then + _unmount_rootfs + rm -f ${PID_FILE} + return 0 + fi + return 1 +} + +# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates +reload_nvidia_peermem() { + if [ "$USE_HOST_MOFED" = "true" ]; then + until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /sys/module/nvidia/refcnt ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + else + # use driver readiness flag created by MOFED container + until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /sys/module/nvidia/refcnt ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + fi + # get any parameters provided for nvidia-peermem + _get_module_params && set +o nounset + if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + echo "successfully loaded nvidia-peermem module, now waiting for signal" + sleep inf + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + fi + fi + echo "failed to load nvidia-peermem module" + exit 1 +} + +# probe by gpu-opertor for liveness/startup checks for nvidia-peermem module to be loaded when MOFED drivers are ready +probe_nvidia_peermem() { + if lsmod | grep mlx5_core > /dev/null 2>&1; then + if [ ! -f /sys/module/nvidia_peermem/refcnt ]; then + echo "nvidia-peermem module is not loaded" + return 1 + fi + else + echo "MOFED drivers are not ready, skipping probe to avoid container restarts..." 
+ fi + return 0 +} + +usage() { + cat >&2 < Date: Fri, 15 May 2026 13:11:30 +0530 Subject: [PATCH 2/3] Precompiled: Ubuntu26.04 driver container support Signed-off-by: Shiva Kumar (SW-CLOUD) --- .github/workflows/precompiled.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index c58f0c50e..23baa1cfb 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -402,7 +402,7 @@ jobs: fi - name: Set up Holodeck - uses: NVIDIA/holodeck@v0.3.2 + uses: NVIDIA/holodeck@ubuntu26.04 env: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} From 13a221e5cac3fac3be2481663c120e8c0befc8ba Mon Sep 17 00:00:00 2001 From: Shiva Kumar Date: Fri, 15 May 2026 13:39:10 +0530 Subject: [PATCH 3/3] Precompiled: Ubuntu26.04 driver container support Signed-off-by: Shiva Kumar (SW-CLOUD) --- .github/workflows/precompiled.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index 23baa1cfb..7df364d66 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -92,7 +92,7 @@ jobs: dist: ubuntu22.04 - flavor: azure-fde dist: ubuntu22.04 - max-parallel: 5 + max-parallel: 25 steps: - uses: actions/checkout@v6 name: Check out code