diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f9236b09f..82ab7210c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,7 @@ on: schedule: # every 24 hours at midnight UTC - cron: "0 0 * * *" + workflow_dispatch: jobs: ci-vars: diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 9581cff308..de1e713e49 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -275,13 +275,15 @@ jobs: uses: nv-gha-runners/setup-proxy-cache@main continue-on-error: true - - name: Update driver + # DRIVER above is 'latest' so install_gpu_driver.ps1 is intentionally + # skipped (it errors on latest/earliest); configure_driver_mode.ps1 + # still runs to put the pre-installed driver into TCC mode. + - name: Configure driver mode shell: powershell env: DRIVER_MODE: "TCC" - GPU_TYPE: "a100" run: | - ci/tools/install_gpu_driver.ps1 + ci/tools/configure_driver_mode.ps1 - name: Ensure GPU is working run: | diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index f8002f5124..57bc4dc555 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -85,8 +85,13 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Apply matrix filter and wrap in include structure - MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') + # Apply matrix filter; reject custom DRIVER + FLAVOR=wsl (the + # in-container driver swap doesn't work under WSL); add a + # RUNNER_DRIVER field that maps any custom version back to + # 'latest' (the install script swaps the driver itself, so we + # need to land on the runner that ships with the most recent + # pre-installed driver); wrap in include structure. 
+ MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" @@ -101,21 +106,21 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} - runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" + runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" # The build stage could fail but we want the CI to keep moving. if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} # Our self-hosted runners require a container # TODO: use a different (nvidia?) container container: - options: -u root --security-opt seccomp=unconfined --shm-size 16g + # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh + # can nsenter to the host for the install + refresh the toolkit bind mounts + # back inside the container. Stock options for latest/earliest rows. 
+ options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }} image: ubuntu:22.04 env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} PIP_CACHE_DIR: "/tmp/pip-cache" steps: - - name: Ensure GPU is working - run: nvidia-smi - - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -131,6 +136,21 @@ jobs: dependencies: "jq wget libgl1 libegl1 g++" dependent_exes: "jq wget" + - name: Install GPU driver + if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} + env: + DRIVER: ${{ matrix.DRIVER }} + GPU_TYPE: ${{ matrix.GPU }} + run: | + # util-linux for nsenter; install_gpu_driver.sh re-execs onto the + # host (requires --privileged --pid=host on the container, set + # conditionally above) and refreshes the toolkit bind mounts here. + apt-get -y install --no-install-recommends util-linux + ./ci/tools/install_gpu_driver.sh + + - name: Ensure GPU is working + run: nvidia-smi + - name: Set environment variables env: BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }} diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 320817177f..5675b395af 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -81,8 +81,11 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Apply matrix filter and wrap in include structure - MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') + # Apply matrix filter; add a RUNNER_DRIVER field that maps any + # custom DRIVER version back to 'latest' (install_gpu_driver.ps1 + # swaps the driver itself, so the runner must be the one that + # ships the most recent pre-installed driver); wrap in include. + MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" @@ -95,7 +98,7 @@ jobs: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} - runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" + runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -106,13 +109,20 @@ jobs: with: enable-apt: true - - name: Update driver + - name: Install GPU driver + if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} env: - DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + DRIVER: ${{ matrix.DRIVER }} GPU_TYPE: ${{ matrix.GPU }} run: | ci/tools/install_gpu_driver.ps1 + - name: Configure driver mode + env: + DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + run: | + ci/tools/configure_driver_mode.ps1 + - name: Ensure GPU is working run: | nvidia-smi diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 95c5e714ca..51f0d3f063 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -13,7 +13,16 @@ # Windows entries also include DRIVER_MODE. 
# # Notes: +# - DRIVER accepts: +# * 'latest' - use the runner's pre-installed latest driver (no install step) +# * 'earliest' - use the runner's pre-installed earliest driver (no install step) +# * a version string (e.g. '580.65.06') +# - install that version via ci/tools/install_gpu_driver.sh (Linux) +# or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the +# job. The matrix row is routed to the 'latest' runner image (the +# install scripts swap the driver themselves). # - DRIVER: 'earliest' does not work with CUDA 12.9.1 +# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux. linux: pull-request: @@ -29,10 +38,10 @@ linux: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { 
ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } @@ -74,7 +83,7 @@ linux: - { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' } - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } # nightly-standard (arm64 l4×2 — nightly-only per runner team request) @@ -113,4 +122,4 @@ windows: - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' } diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 new file mode 
100644 index 0000000000..42e0914935 --- /dev/null +++ b/ci/tools/configure_driver_mode.ps1 @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI +# runner and cycle the display devices so the new mode takes effect +# without rebooting. Always runs (whether or not install_gpu_driver.ps1 +# just ran). When install_gpu_driver.ps1 has run, this single device +# cycle also activates the freshly-installed driver. +# +# Inputs (env): +# DRIVER_MODE One of WDDM, TCC, MCDM. + +function Set-DriverMode { + + # Map matrix DRIVER_MODE to nvidia-smi -fdm code. + # This assumes we have the prior knowledge on which GPU can use which mode. + $driver_mode = $env:DRIVER_MODE + if ($driver_mode -eq "WDDM") { + Write-Output "Setting driver mode to WDDM..." + nvidia-smi -fdm 0 + } elseif ($driver_mode -eq "TCC") { + Write-Output "Setting driver mode to TCC..." + nvidia-smi -fdm 1 + } elseif ($driver_mode -eq "MCDM") { + Write-Output "Setting driver mode to MCDM..." + nvidia-smi -fdm 2 + } else { + Write-Output "Unknown driver mode: $driver_mode" + exit 1 + } + + # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) + $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" + foreach ($device in $nvidia_devices) { + Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" + pnputil /disable-device "$($device.InstanceId)" + pnputil /enable-device "$($device.InstanceId)" + } + + # Poll nvidia-smi until NVML can initialize, or give up after ~60s. + # A fixed sleep is not enough on slower-coming-back-up multi-GPU rows + # (e.g. 2x H100 MCDM) where pnputil enable returns before NVML is + # ready. Pattern borrowed from the runner-team `nvgha-driver.ps1`. + Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." 
+ $deadline = (Get-Date).AddSeconds(60) + do { + Start-Sleep -Seconds 2 + & nvidia-smi.exe 2>&1 | Out-Null + } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline) + if ($LASTEXITCODE -ne 0) { + Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle" + exit 1 + } +} + +# Run the functions +Set-DriverMode diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1 index c98416c87e..e61c6bbdbb 100644 --- a/ci/tools/install_gpu_driver.ps1 +++ b/ci/tools/install_gpu_driver.ps1 @@ -1,13 +1,30 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 +# +# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a +# Windows CI runner. Driver-mode selection and the post-install device +# power-cycle are the responsibility of configure_driver_mode.ps1, which +# the workflow runs immediately after this script (or by itself when +# DRIVER is 'latest'/'earliest' and the runner already brings up the +# right driver). +# +# Inputs (env): +# DRIVER Driver version, e.g. "610.47". Must NOT be 'latest' or +# 'earliest' -- those are runner-pre-installed and the +# workflow is expected to skip this script for them. +# GPU_TYPE Lower-case GPU label from the matrix (e.g. "l4", "rtx4090"). +# Selects the data-center vs desktop installer variant. # Install the driver function Install-Driver { - # Set the correct URL, filename, and arguments to the installer - # This driver is picked to support Windows 11 & CUDA 13.0 - $version = '581.15' + # Driver version is plumbed from the matrix via the DRIVER env var. + $version = $env:DRIVER + if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') { + Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'." 
+ exit 1 + } # Get GPU type from environment variable $gpu_type = $env:GPU_TYPE @@ -54,33 +71,7 @@ function Install-Driver { # Install the file with the specified path from earlier Write-Output 'Running the driver installer...' Start-Process -FilePath $filepath -ArgumentList $install_args -Wait - Write-Output 'Done!' - - # Handle driver mode configuration - # This assumes we have the prior knowledge on which GPU can use which mode. - $driver_mode = $env:DRIVER_MODE - if ($driver_mode -eq "WDDM") { - Write-Output "Setting driver mode to WDDM..." - nvidia-smi -fdm 0 - } elseif ($driver_mode -eq "TCC") { - Write-Output "Setting driver mode to TCC..." - nvidia-smi -fdm 1 - } elseif ($driver_mode -eq "MCDM") { - Write-Output "Setting driver mode to MCDM..." - nvidia-smi -fdm 2 - } else { - Write-Output "Unknown driver mode: $driver_mode" - exit 1 - } - # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) - $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" - foreach ($device in $nvidia_devices) { - Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" - pnputil /disable-device "$($device.InstanceId)" - pnputil /enable-device "$($device.InstanceId)" - } - # Give it a minute to settle: - Start-Sleep -Seconds 5 + Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.' } # Run the functions diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh new file mode 100755 index 0000000000..db7ae99858 --- /dev/null +++ b/ci/tools/install_gpu_driver.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# install_gpu_driver.sh -- install a specific NVIDIA driver version on a +# Linux CI runner. Adapted from nv-gha-runners/vm-images PR #256 +# (`nvgha-driver` CLI), trimmed and parameterised for cuda-python's CI. +# +# !!! 
ALPHA !!! +# Performs live modifications to the host driver stack (kernel module +# reload, package replacement, and -- inside containers -- toolkit +# bind-mount refresh) and may cause issues. +# +# Inputs (env): +# DRIVER Driver version, e.g. "580.65.06". Must NOT be 'latest' or +# 'earliest' -- those are runner-pre-installed and the +# workflow is expected to skip this script for them. +# GPU_TYPE Lower-case GPU label from the matrix (e.g. "v100", "l4", +# "h100"). Used only to pick the kernel module flavor +# (Volta needs the proprietary/legacy module; everything +# newer can use the open module). +# +# Arch is detected from `uname -m`. +# +# When the script runs inside a container (the cuda-python Linux jobs do) +# it re-execs itself on the host via `nsenter`. The job must declare +# `options: --privileged --pid=host` (the workflow only does this for +# matrix rows with a custom DRIVER). After the host-side install, the +# container's bind-mounted nvidia libs/binaries are refreshed in-place so +# the new driver is visible without restarting the container. +set -euo pipefail + +: "${DRIVER:?DRIVER env var is required (e.g. 580.65.06)}" +: "${GPU_TYPE:?GPU_TYPE env var is required (e.g. l4)}" + +case "$DRIVER" in + latest|earliest) + echo "::error::install_gpu_driver.sh must not be invoked with DRIVER=$DRIVER (runner-pre-installed)" >&2 + exit 1 + ;; +esac + +VERSION="$DRIVER" + +# Volta (V100) requires the legacy/proprietary kernel module; all newer +# GPUs in this matrix support the open module. Extend this if/when older +# GPUs return to the matrix. 
+case "$GPU_TYPE" in + v100) KMT=proprietary ;; + *) KMT=open ;; +esac + +case "$(uname -m)" in + x86_64) + ARCH_DIR=Linux-x86_64 + ARCH_SUFFIX=x86_64 + ;; + aarch64) + ARCH_DIR=Linux-aarch64 + ARCH_SUFFIX=aarch64 + ;; + *) + echo "::error::unsupported arch: $(uname -m)" >&2 + exit 1 + ;; +esac + +URL="https://us.download.nvidia.com/XFree86/${ARCH_DIR}/${VERSION}/NVIDIA-Linux-${ARCH_SUFFIX}-${VERSION}.run" + +# Re-elevate to root if needed (sudo is preinstalled on the runner image). +if [ "$(id -u)" != 0 ]; then + exec sudo -E DRIVER="$DRIVER" GPU_TYPE="$GPU_TYPE" "$0" "$@" +fi + +echo "install_gpu_driver.sh is ALPHA -- it performs live modifications to the host driver stack and may cause issues" >&2 +echo "DRIVER=${VERSION} GPU_TYPE=${GPU_TYPE} KMT=${KMT} ARCH=${ARCH_SUFFIX}" >&2 +echo "URL=${URL}" >&2 + +# Toolkit packages we keep across the purge: dockerd's --runtime=nvidia +# resolves nvidia-container-runtime through these, and removing them +# breaks `docker exec` against any container started with that runtime. +KEEP_RE='^(nvidia-container-toolkit(-base)?|libnvidia-container1|libnvidia-container-tools)$' + +in_container() { + [ -f /.dockerenv ] || grep -qE '/(docker|kubepods|containerd)' /proc/1/cgroup 2>/dev/null +} + +host_install() { + apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod + + systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true + # if-test instead of `fuser ... || true` so a kill failure surfaces + # (fuser exits 1 when nothing holds the device, which is the happy path). + if fuser /dev/nvidia* >/dev/null 2>&1; then + fuser -kv /dev/nvidia* + fi + sleep 1 + for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do + rmmod "$m" 2>/dev/null || true + done + + # Purge existing nvidia/libnvidia packages, except the toolkit pieces + # captured by KEEP_RE. Tolerate apt failures: postrm scripts can trip + # and the .run installer is about to replace everything anyway. 
+ dpkg-query -W -f='${Package}\n' 'nvidia-*' 'libnvidia-*' 2>/dev/null \ + | awk -v re="$KEEP_RE" '$0 !~ re' \ + | xargs -r apt-get -y remove --purge || true + + local d + d=$(mktemp -d) + ( cd "$d" \ + && wget -q -O installer.run "$URL" \ + && sh installer.run --silent --dkms --no-questions \ + --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) + modprobe -a nvidia nvidia_uvm nvidia_modeset # -a loads every listed module; without it modprobe treats the extra names as parameters for 'nvidia' + + # Bring nvidia-persistenced back up. NVML state-changing calls from + # inside the test container (e.g. nvmlDeviceSetPersistenceMode, which + # cuda.core's test_persistence_mode_enabled exercises) talk to the + # daemon via /run/nvidia-persistenced/socket; without a live daemon + # they return NVML_ERROR_UNKNOWN. + # + # systemctl can't start the unit (the `--silent --no-questions` .run + # installer drops /usr/bin/nvidia-persistenced but no usable systemd + # unit), so exec the binary directly -- it self-daemonizes. + # + # `--user root` because the default `nvidia-persistenced` user was + # deleted along with `nvidia-compute-utils-*` in the purge above; + # without this flag the daemon's post-fork setuid() fails silently + # and the process exits. + # + # nv-gha-runners/vm-images' `nvgha-driver` has the same latent gap; + # its CUDA-runtime validation workload never issues an NVML SET + # write so it hasn't surfaced there. + /usr/bin/nvidia-persistenced --verbose --user root || true +} + +# Replace the toolkit's bind-mounted nvidia libs/binaries inside this +# container with copies from the host's new install. `cp` (not +# `mount --bind`) because procfs-routed binds drop the exec bit. +refresh_container_libs() { + # Walk /proc/self/mountinfo and match the toolkit-injected nvidia + # binds via mount point (field 5) so deleted source paths -- which + # the kernel suffixes field 4 with " (deleted)" once the host unlinks + # the old lib -- don't break discovery. Filters skip what we can't or + # shouldn't refresh: + # $3 ~ /^0:/ tmpfs/proc/sysfs (e.g. 
the toolkit hook tmpfs) + # $5 must be under /usr/(bin|lib) binaries + libs only -- explicitly + # NOT /run/nvidia-persistenced/socket + # (cp'ing the daemon's IPC socket unlinks + # the container's view and turns later + # NVML state-changing calls into + # NVML_ERROR_UNKNOWN); NOT /dev/nvidia* + # (character devices); NOT /proc/driver/nvidia + # (procfs); NOT /tmp/nvidia-mps (runtime). + # $5 ~ /\.json$/ vulkan/glvnd config remaps (not version-bound) + # $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers + local mounts + mounts=$(awk ' + $3 !~ /^0:/ && + $5 ~ /^\/usr\/(bin|lib)/ && + $5 !~ /\.json$/ && + $5 !~ /\/(firmware|xorg)\// && + $5 ~ /(nvidia|libcuda)/ { print $5 } + ' /proc/self/mountinfo | sort -u) + + for tgt in $mounts; do + local src="/proc/1/root$tgt" + if [ ! -e "$src" ]; then + # Driver swap rewrites the version suffix (libfoo.so.595.71.05 -> + # libfoo.so.580.65.06); strip it and find the new file. + local base + base=$(basename "$tgt") + base="${base%.so.*}.so" + src=$(find "/proc/1/root$(dirname "$tgt")" -maxdepth 1 -name "${base}.*" 2>/dev/null \ + | sort -V | tail -n1) || true # under pipefail a failing find (e.g. dir missing on the host) would abort via set -e; leave src empty and take the skip below + [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; } + fi + umount "$tgt" 2>/dev/null || true + # --preserve=mode keeps the SUID bit so the refresh doesn't silently + # de-privilege binaries like nvidia-modprobe that ship 4755 (the + # runner-team's nvgha-driver uses plain `cp` and has the same gap). + cp -f --preserve=mode --remove-destination "$src" "$tgt" \ + || echo "WARN: refresh failed for $tgt (src=$src)" >&2 + done + ldconfig +} + +if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then + # Re-exec on the host. 
The runner-team's `nvgha-driver` script lives at a + # host-side absolute path so `"$0"` survives the mount-namespace flip; + # ours lives in the GH workspace mount (container-only), so we pipe the + # script body in via stdin instead -- the `< "$0"` fd is opened before + # nsenter and stays valid across the namespace switch. Env vars (DRIVER, + # GPU_TYPE, _NVDRV_NSENTERED) are inherited by the host-side bash. + _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \ + || { echo "::error::host-side driver install failed (see the log above); if nsenter itself was denied, the container needs 'options: --privileged --pid=host'" >&2; exit 1; } + refresh_container_libs + + # Re-bind /run/nvidia-persistenced from the host. The container's + # original bind mount was pinned to the host's at-container-start + # inode; the daemon stop+restart cycle recreates the dir under a + # fresh inode, stranding the bind mount on the deleted one (socket + # shows up with link count 0 inside the container, NVML SET calls + # return NVML_ERROR_UNKNOWN). Re-bind from /proc/1/root so the + # container picks up the live host dir. Needs CAP_SYS_ADMIN, which + # the workflow grants via --privileged --pid=host on custom-DRIVER + # rows. + if [ -d /proc/1/root/run/nvidia-persistenced ]; then + umount /run/nvidia-persistenced 2>/dev/null || true + mkdir -p /run/nvidia-persistenced + mount --bind /proc/1/root/run/nvidia-persistenced /run/nvidia-persistenced + fi +else + host_install +fi + +nvidia-smi >/dev/null +grep -qF "$VERSION" /proc/driver/nvidia/version