From b1b6070a823898a01e87c49cb1682f6d65a96eb9 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 01:48:44 +0000 Subject: [PATCH 01/16] CI: allow specifying custom driver versions in test matrix Extends the DRIVER field in ci/test-matrix.yml beyond 'latest'/'earliest' to accept an explicit version string (e.g. '580.65.06'). For Linux, ci/tools/install_gpu_driver.sh (adapted from nv-gha-runners/vm-images PR #256) swaps the driver in-job via nsenter when the row uses a custom version; for Windows, ci/tools/install_gpu_driver.ps1 is split into install + configure_driver_mode, with the install step gated on the DRIVER value and the mode step always running. The matrix row is routed to a 'latest' runner image when the DRIVER is a custom version (the install scripts perform the swap themselves). Container privileges on Linux (--privileged --pid=host) are added only on rows with a custom DRIVER. Custom DRIVER + FLAVOR=wsl is rejected eagerly in the compute-matrix step. Two existing nightly-numba-cuda rows exercise the new path: - Linux amd64 / 13.3.0 / l4 -> 580.65.06 - Windows amd64 / 13.3.0 / l4 -> 610.47 Closes #293 Closes #1265 --- .github/workflows/coverage.yml | 8 +- .github/workflows/test-wheel-linux.yml | 28 +++- .github/workflows/test-wheel-windows.yml | 20 ++- ci/test-matrix.yml | 13 +- ci/tools/configure_driver_mode.ps1 | 45 ++++++ ci/tools/install_gpu_driver.ps1 | 51 +++---- ci/tools/install_gpu_driver.sh | 167 +++++++++++++++++++++++ 7 files changed, 288 insertions(+), 44 deletions(-) create mode 100644 ci/tools/configure_driver_mode.ps1 create mode 100755 ci/tools/install_gpu_driver.sh diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 9581cff3088..de1e713e499 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -275,13 +275,15 @@ jobs: uses: nv-gha-runners/setup-proxy-cache@main continue-on-error: true - - name: Update driver + # DRIVER above is 'latest' so install_gpu_driver.ps1 is 
intentionally + # skipped (it errors on latest/earliest); configure_driver_mode.ps1 + # still runs to put the pre-installed driver into TCC mode. + - name: Configure driver mode shell: powershell env: DRIVER_MODE: "TCC" - GPU_TYPE: "a100" run: | - ci/tools/install_gpu_driver.ps1 + ci/tools/configure_driver_mode.ps1 - name: Ensure GPU is working run: | diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index f8002f5124a..4f56cb57740 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -85,8 +85,13 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Apply matrix filter and wrap in include structure - MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') + # Apply matrix filter; reject custom DRIVER + FLAVOR=wsl (the + # in-container driver swap doesn't work under WSL); add a + # RUNNER_DRIVER field that maps any custom version back to + # 'latest' (the install script swaps the driver itself, so we + # need to land on the runner that ships with the most recent + # pre-installed driver); wrap in include structure. + MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" @@ -101,13 +106,16 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} - runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" + runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" # The build stage could fail but we want the CI to keep moving. if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} # Our self-hosted runners require a container # TODO: use a different (nvidia?) container container: - options: -u root --security-opt seccomp=unconfined --shm-size 16g + # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh + # can nsenter to the host for the install + refresh the toolkit bind mounts + # back inside the container. Stock options for latest/earliest rows. + options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }} image: ubuntu:22.04 env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} @@ -131,6 +139,18 @@ jobs: dependencies: "jq wget libgl1 libegl1 g++" dependent_exes: "jq wget" + - name: Install GPU driver + if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} + env: + DRIVER: ${{ matrix.DRIVER }} + GPU_TYPE: ${{ matrix.GPU }} + run: | + # util-linux for nsenter; install_gpu_driver.sh re-execs onto the + # host (requires --privileged --pid=host on the container, set + # conditionally above) and refreshes the toolkit bind mounts here. 
+ apt-get -y install --no-install-recommends util-linux + ./ci/tools/install_gpu_driver.sh + - name: Set environment variables env: BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }} diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 320817177f3..5675b395afe 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -81,8 +81,11 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Apply matrix filter and wrap in include structure - MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') + # Apply matrix filter; add a RUNNER_DRIVER field that maps any + # custom DRIVER version back to 'latest' (install_gpu_driver.ps1 + # swaps the driver itself, so the runner must be the one that + # ships the most recent pre-installed driver); wrap in include. + MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. 
| length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" @@ -95,7 +98,7 @@ jobs: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} - runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" + runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -106,13 +109,20 @@ jobs: with: enable-apt: true - - name: Update driver + - name: Install GPU driver + if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} env: - DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + DRIVER: ${{ matrix.DRIVER }} GPU_TYPE: ${{ matrix.GPU }} run: | ci/tools/install_gpu_driver.ps1 + - name: Configure driver mode + env: + DRIVER_MODE: ${{ matrix.DRIVER_MODE }} + run: | + ci/tools/configure_driver_mode.ps1 + - name: Ensure GPU is working run: | nvidia-smi diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 95c5e714caa..3d5693a188a 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -13,7 +13,16 @@ # Windows entries also include DRIVER_MODE. # # Notes: +# - DRIVER accepts: +# * 'latest' - use the runner's pre-installed latest driver (no install step) +# * 'earliest' - use the runner's pre-installed earliest driver (no install step) +# * a version string (e.g. '580.65.06') +# - install that version via ci/tools/install_gpu_driver.sh (Linux) +# or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the +# job. The matrix row is routed to the 'latest' runner image (the +# install scripts swap the driver themselves). # - DRIVER: 'earliest' does not work with CUDA 12.9.1 +# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux. 
linux: pull-request: @@ -74,7 +83,7 @@ linux: - { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' } - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } # nightly-standard (arm64 l4×2 — nightly-only per runner team request) @@ -113,4 +122,4 @@ windows: - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47', DRIVER_MODE: 'TCC' } diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 new file mode 100644 index 00000000000..280e725e11b --- /dev/null +++ b/ci/tools/configure_driver_mode.ps1 @@ 
-0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI +# runner and cycle the display devices so the new mode takes effect +# without rebooting. Always runs (whether or not install_gpu_driver.ps1 +# just ran). When install_gpu_driver.ps1 has run, this single device +# cycle also activates the freshly-installed driver. +# +# Inputs (env): +# DRIVER_MODE One of WDDM, TCC, MCDM. + +function Set-DriverMode { + + # Map matrix DRIVER_MODE to nvidia-smi -fdm code. + # This assumes we have the prior knowledge on which GPU can use which mode. + $driver_mode = $env:DRIVER_MODE + if ($driver_mode -eq "WDDM") { + Write-Output "Setting driver mode to WDDM..." + nvidia-smi -fdm 0 + } elseif ($driver_mode -eq "TCC") { + Write-Output "Setting driver mode to TCC..." + nvidia-smi -fdm 1 + } elseif ($driver_mode -eq "MCDM") { + Write-Output "Setting driver mode to MCDM..." + nvidia-smi -fdm 2 + } else { + Write-Output "Unknown driver mode: $driver_mode" + exit 1 + } + + # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) + $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" + foreach ($device in $nvidia_devices) { + Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" + pnputil /disable-device "$($device.InstanceId)" + pnputil /enable-device "$($device.InstanceId)" + } + # Give it a minute to settle: + Start-Sleep -Seconds 5 +} + +# Run the functions +Set-DriverMode diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1 index c98416c87e2..e61c6bbdbb1 100644 --- a/ci/tools/install_gpu_driver.ps1 +++ b/ci/tools/install_gpu_driver.ps1 @@ -1,13 +1,30 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 +# +# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a +# Windows CI runner. Driver-mode selection and the post-install device +# power-cycle are the responsibility of configure_driver_mode.ps1, which +# the workflow runs immediately after this script (or by itself when +# DRIVER is 'latest'/'earliest' and the runner already brings up the +# right driver). +# +# Inputs (env): +# DRIVER Driver version, e.g. "610.47". Must NOT be 'latest' or +# 'earliest' -- those are runner-pre-installed and the +# workflow is expected to skip this script for them. +# GPU_TYPE Lower-case GPU label from the matrix (e.g. "l4", "rtx4090"). +# Selects the data-center vs desktop installer variant. # Install the driver function Install-Driver { - # Set the correct URL, filename, and arguments to the installer - # This driver is picked to support Windows 11 & CUDA 13.0 - $version = '581.15' + # Driver version is plumbed from the matrix via the DRIVER env var. + $version = $env:DRIVER + if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') { + Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'." + exit 1 + } # Get GPU type from environment variable $gpu_type = $env:GPU_TYPE @@ -54,33 +71,7 @@ function Install-Driver { # Install the file with the specified path from earlier Write-Output 'Running the driver installer...' Start-Process -FilePath $filepath -ArgumentList $install_args -Wait - Write-Output 'Done!' - - # Handle driver mode configuration - # This assumes we have the prior knowledge on which GPU can use which mode. - $driver_mode = $env:DRIVER_MODE - if ($driver_mode -eq "WDDM") { - Write-Output "Setting driver mode to WDDM..." - nvidia-smi -fdm 0 - } elseif ($driver_mode -eq "TCC") { - Write-Output "Setting driver mode to TCC..." - nvidia-smi -fdm 1 - } elseif ($driver_mode -eq "MCDM") { - Write-Output "Setting driver mode to MCDM..." 
- nvidia-smi -fdm 2 - } else { - Write-Output "Unknown driver mode: $driver_mode" - exit 1 - } - # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) - $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" - foreach ($device in $nvidia_devices) { - Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" - pnputil /disable-device "$($device.InstanceId)" - pnputil /enable-device "$($device.InstanceId)" - } - # Give it a minute to settle: - Start-Sleep -Seconds 5 + Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.' } # Run the functions diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh new file mode 100755 index 00000000000..5dff7043487 --- /dev/null +++ b/ci/tools/install_gpu_driver.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# install_gpu_driver.sh -- install a specific NVIDIA driver version on a +# Linux CI runner. Adapted from nv-gha-runners/vm-images PR #256 +# (`nvgha-driver` CLI), trimmed and parameterised for cuda-python's CI. +# +# !!! ALPHA !!! +# Performs live modifications to the host driver stack (kernel module +# reload, package replacement, and -- inside containers -- toolkit +# bind-mount refresh) and may cause issues. +# +# Inputs (env): +# DRIVER Driver version, e.g. "580.65.06". Must NOT be 'latest' or +# 'earliest' -- those are runner-pre-installed and the +# workflow is expected to skip this script for them. +# GPU_TYPE Lower-case GPU label from the matrix (e.g. "v100", "l4", +# "h100"). Used only to pick the kernel module flavor +# (Volta needs the proprietary/legacy module; everything +# newer can use the open module). +# +# Arch is detected from `uname -m`. 
+# +# When the script runs inside a container (the cuda-python Linux jobs do) +# it re-execs itself on the host via `nsenter`. The job must declare +# `options: --privileged --pid=host` (the workflow only does this for +# matrix rows with a custom DRIVER). After the host-side install, the +# container's bind-mounted nvidia libs/binaries are refreshed in-place so +# the new driver is visible without restarting the container. +set -euo pipefail + +: "${DRIVER:?DRIVER env var is required (e.g. 580.65.06)}" +: "${GPU_TYPE:?GPU_TYPE env var is required (e.g. l4)}" + +case "$DRIVER" in + latest|earliest) + echo "::error::install_gpu_driver.sh must not be invoked with DRIVER=$DRIVER (runner-pre-installed)" >&2 + exit 1 + ;; +esac + +VERSION="$DRIVER" + +# Volta (V100) requires the legacy/proprietary kernel module; all newer +# GPUs in this matrix support the open module. Extend this if/when older +# GPUs return to the matrix. +case "$GPU_TYPE" in + v100) KMT=proprietary ;; + *) KMT=open ;; +esac + +case "$(uname -m)" in + x86_64) + ARCH_DIR=Linux-x86_64 + ARCH_SUFFIX=x86_64 + ;; + aarch64) + ARCH_DIR=Linux-aarch64 + ARCH_SUFFIX=aarch64 + ;; + *) + echo "::error::unsupported arch: $(uname -m)" >&2 + exit 1 + ;; +esac + +URL="https://us.download.nvidia.com/XFree86/${ARCH_DIR}/${VERSION}/NVIDIA-Linux-${ARCH_SUFFIX}-${VERSION}.run" + +# Re-elevate to root if needed (sudo is preinstalled on the runner image). +if [ "$(id -u)" != 0 ]; then + exec sudo -E DRIVER="$DRIVER" GPU_TYPE="$GPU_TYPE" "$0" "$@" +fi + +echo "install_gpu_driver.sh is ALPHA -- it performs live modifications to the host driver stack and may cause issues" >&2 +echo "DRIVER=${VERSION} GPU_TYPE=${GPU_TYPE} KMT=${KMT} ARCH=${ARCH_SUFFIX}" >&2 +echo "URL=${URL}" >&2 + +# Toolkit packages we keep across the purge: dockerd's --runtime=nvidia +# resolves nvidia-container-runtime through these, and removing them +# breaks `docker exec` against any container started with that runtime. 
+KEEP_RE='^(nvidia-container-toolkit(-base)?|libnvidia-container1|libnvidia-container-tools)$' + +in_container() { + [ -f /.dockerenv ] || grep -qE '/(docker|kubepods|containerd)' /proc/1/cgroup 2>/dev/null +} + +host_install() { + apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod + + systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true + # if-test instead of `fuser ... || true` so a kill failure surfaces + # (fuser exits 1 when nothing holds the device, which is the happy path). + if fuser /dev/nvidia* >/dev/null 2>&1; then + fuser -kv /dev/nvidia* + fi + sleep 1 + for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do + rmmod "$m" 2>/dev/null || true + done + + # Purge existing nvidia/libnvidia packages, except the toolkit pieces + # captured by KEEP_RE. Tolerate apt failures: postrm scripts can trip + # and the .run installer is about to replace everything anyway. + dpkg-query -W -f='${Package}\n' 'nvidia-*' 'libnvidia-*' 2>/dev/null \ + | awk -v re="$KEEP_RE" '$0 !~ re' \ + | xargs -r apt-get -y remove --purge || true + + local d + d=$(mktemp -d) + ( cd "$d" \ + && wget -q -O installer.run "$URL" \ + && sh installer.run --silent --dkms --no-questions \ + --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) + modprobe nvidia nvidia_uvm nvidia_modeset +} + +# Replace the toolkit's bind-mounted nvidia libs/binaries inside this +# container with copies from the host's new install. `cp` (not +# `mount --bind`) because procfs-routed binds drop the exec bit. +refresh_container_libs() { + # Walk /proc/self/mountinfo and match the toolkit-injected nvidia + # binds via mount point (field 5) so deleted source paths -- which + # the kernel suffixes field 4 with " (deleted)" once the host unlinks + # the old lib -- don't break discovery. Filters skip what we can't or + # shouldn't refresh: + # $3 ~ /^0:/ tmpfs/proc/sysfs (e.g. 
the toolkit hook tmpfs) + # $5 ~ /\.json$/ vulkan/glvnd config remaps (not version-bound) + # $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers + local mounts + mounts=$(awk ' + $3 !~ /^0:/ && + $5 !~ /\.json$/ && + $5 !~ /\/(firmware|xorg)\// && + $5 ~ /(nvidia|libcuda)/ { print $5 } + ' /proc/self/mountinfo | sort -u) + + for tgt in $mounts; do + local src="/proc/1/root$tgt" + if [ ! -e "$src" ]; then + # Driver swap rewrites the version suffix (libfoo.so.595.71.05 -> + # libfoo.so.580.65.06); strip it and find the new file. + local base + base=$(basename "$tgt") + base="${base%.so.*}.so" + src=$(find "/proc/1/root$(dirname "$tgt")" -maxdepth 1 -name "${base}.*" 2>/dev/null \ + | sort -V | tail -n1) + [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; } + fi + umount "$tgt" 2>/dev/null || true + cp -f --remove-destination "$src" "$tgt" \ + || echo "WARN: refresh failed for $tgt (src=$src)" >&2 + done + ldconfig +} + +if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then + _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \ + || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; } + refresh_container_libs +else + host_install +fi + +nvidia-smi >/dev/null +grep -qF "$VERSION" /proc/driver/nvidia/version From 3e016b572dcded7701f9b2f12c25cec3cb7e5b1d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 03:09:01 +0000 Subject: [PATCH 02/16] CI: fix Linux driver nsenter re-exec, swap Windows version, enable ci.yml dispatch - install_gpu_driver.sh: pipe the script body to the host-side bash via stdin (bash -s < "$0") instead of re-execing "$0". The script lives in the GH workspace mount (container-only), so the relative path doesn't resolve after nsenter switches the mount namespace. The < "$0" fd is opened before nsenter and survives the flip. - test-matrix.yml: Windows nightly-numba-cuda row 610.47 -> 596.36 (610.47 isn't published on the CDN; install hit 404). 
- ci.yml: add workflow_dispatch: trigger so the pipeline can be re-run manually. The existing should-skip / detect-changes gates already handle non-PR events. --- .github/workflows/ci.yml | 1 + ci/test-matrix.yml | 2 +- ci/tools/install_gpu_driver.sh | 8 +++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f9236b09fe..82ab7210c92 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,7 @@ on: schedule: # every 24 hours at midnight UTC - cron: "0 0 * * *" + workflow_dispatch: jobs: ci-vars: diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 3d5693a188a..730791ac283 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -122,4 +122,4 @@ windows: - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } # nightly-numba-cuda - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47', DRIVER_MODE: 'TCC' } + - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' } diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index 5dff7043487..b7aeb3434f4 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -156,7 +156,13 @@ refresh_container_libs() { } if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then - _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \ + # Re-exec on the host. 
The runner-team's `nvgha-driver` script lives at a + # host-side absolute path so `"$0"` survives the mount-namespace flip; + # ours lives in the GH workspace mount (container-only), so we pipe the + # script body in via stdin instead -- the `< "$0"` fd is opened before + # nsenter and stays valid across the namespace switch. Env vars (DRIVER, + # GPU_TYPE, _NVDRV_NSENTERED) are inherited by the host-side bash. + _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \ || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; } refresh_container_libs else From c0ca8696e64c7fb1e275ae628905ba6d86144279 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 03:27:53 +0000 Subject: [PATCH 03/16] CI: move 'Ensure GPU is working' after 'Install GPU driver' on Linux So nvidia-smi validates the post-install driver state on custom-DRIVER rows. Windows test-wheel + coverage already use Install -> Configure -> Ensure; this brings the Linux test-wheel job into line. 
--- .github/workflows/test-wheel-linux.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 4f56cb57740..57bc4dc555f 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -121,9 +121,6 @@ jobs: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} PIP_CACHE_DIR: "/tmp/pip-cache" steps: - - name: Ensure GPU is working - run: nvidia-smi - - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -151,6 +148,9 @@ jobs: apt-get -y install --no-install-recommends util-linux ./ci/tools/install_gpu_driver.sh + - name: Ensure GPU is working + run: nvidia-smi + - name: Set environment variables env: BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }} From 4a23b23a26111a8ead0cc40519a2b558ea9bfe66 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 03:30:30 +0000 Subject: [PATCH 04/16] CI: flip two PR-matrix Linux rows to DRIVER=610.43.02 Exercises the custom-driver install path on every PR (not just nightly). Both rows are amd64 / 13.3.0 / local-CTK, on l4 and rtxpro6000 -- both in the 'open' kernel-module flavor (only Volta needs the legacy 'proprietary' module). 
--- ci/test-matrix.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 730791ac283..51f0d3f063f 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -38,10 +38,10 @@ linux: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } From d33a928eb464987b6de6eeceaf7ab9454ffe1e91 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 14:16:08 +0000 Subject: [PATCH 05/16] CI: restart nvidia-persistenced on Linux; poll nvidia-smi on Windows Linux: After install_gpu_driver.sh stops nvidia-persistenced and the apt purge removes 
the package, the .run installer reinstalls the systemd service but leaves it stopped. cuda.core's test_persistence_mode_enabled fails with NVML_ERROR_UNKNOWN on driver 610.43.02 when the daemon is not running; explicitly start it again at the end of host_install(). Windows: configure_driver_mode.ps1's trailing 'Start-Sleep -Seconds 5' is not enough on slower-coming-back-up multi-GPU rows (observed: 2x H100 MCDM). Replace it with a poll-until-success loop on nvidia-smi with a 60s deadline, matching the runner-team nvgha-driver.ps1 pattern. Previously masked because every Windows row used to run the full install pipeline; with custom-DRIVER plumbing, latest/earliest rows skip the install and the cycle is no longer preceded by warm-up time. --- ci/tools/configure_driver_mode.ps1 | 17 +++++++++++++++-- ci/tools/install_gpu_driver.sh | 9 +++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1 index 280e725e11b..42e0914935d 100644 --- a/ci/tools/configure_driver_mode.ps1 +++ b/ci/tools/configure_driver_mode.ps1 @@ -37,8 +37,21 @@ function Set-DriverMode { pnputil /disable-device "$($device.InstanceId)" pnputil /enable-device "$($device.InstanceId)" } - # Give it a minute to settle: - Start-Sleep -Seconds 5 + + # Poll nvidia-smi until NVML can initialize, or give up after ~60s. + # A fixed sleep is not enough on slower-coming-back-up multi-GPU rows + # (e.g. 2x H100 MCDM) where pnputil enable returns before NVML is + # ready. Pattern borrowed from the runner-team `nvgha-driver.ps1`. + Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." 
+ $deadline = (Get-Date).AddSeconds(60) + do { + Start-Sleep -Seconds 2 + & nvidia-smi.exe 2>&1 | Out-Null + } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline) + if ($LASTEXITCODE -ne 0) { + Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle" + exit 1 + } } # Run the functions diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index b7aeb3434f4..f104ed09751 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -114,6 +114,15 @@ host_install() { && sh installer.run --silent --dkms --no-questions \ --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) modprobe nvidia nvidia_uvm nvidia_modeset + + # Restore nvidia-persistenced. We stopped it before the install (and the + # purge may have removed it); the .run installer reinstalls the service. + # Some NVML calls -- e.g. nvmlDeviceSetPersistenceMode -- can fail with + # NVML_ERROR_UNKNOWN on newer drivers when the daemon isn't running, and + # cuda.core's test_persistence_mode_enabled trips on that. + if systemctl list-unit-files 2>/dev/null | grep -q '^nvidia-persistenced\.service'; then + systemctl start nvidia-persistenced || true + fi } # Replace the toolkit's bind-mounted nvidia libs/binaries inside this From 00896dc0f6f3fea81f85485c973d1fffcaf48584 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 15:07:21 +0000 Subject: [PATCH 06/16] CI: re-enable persistence mode after Linux driver swap Runner-latest L4 images come up with Persistence-M=On (set somewhere in the runner team's image setup, not in cuda-python). Our .run install leaves it Off, which breaks cuda.core's test_persistence_mode_enabled on driver 610.43.02 -- the test calls device.is_persistence_mode_enabled = False on a device that already reports False, and 610.43.02 returns NVML_ERROR_UNKNOWN for that no-op set. 
Restore the runner baseline by calling `nvidia-smi -pm 1` at the end of host_install() (sets the kernel persistence flag directly via NVML). Also daemon-reload + start nvidia-persistenced.service best-effort so tools that look for the daemon find it; `set -x` around this trailing block so the next run's log confirms which lines fired. --- ci/tools/install_gpu_driver.sh | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index f104ed09751..cef1d6923f8 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -115,14 +115,23 @@ host_install() { --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) modprobe nvidia nvidia_uvm nvidia_modeset - # Restore nvidia-persistenced. We stopped it before the install (and the - # purge may have removed it); the .run installer reinstalls the service. - # Some NVML calls -- e.g. nvmlDeviceSetPersistenceMode -- can fail with - # NVML_ERROR_UNKNOWN on newer drivers when the daemon isn't running, and - # cuda.core's test_persistence_mode_enabled trips on that. - if systemctl list-unit-files 2>/dev/null | grep -q '^nvidia-persistenced\.service'; then - systemctl start nvidia-persistenced || true - fi + # Restore the runner image's baseline state: persistence mode ENABLED + # plus nvidia-persistenced running. The runner-team's pre-installed + # drivers come up with `Persistence-M: On`, but our .run install leaves + # it Off, which breaks tests that toggle the value (cuda.core's + # test_persistence_mode_enabled hits NVML_ERROR_UNKNOWN when setting + # the mode to its current value on driver 610.43.02). + # + # `nvidia-smi -pm 1` is the load-bearing call -- it sets the kernel- + # level persistence flag directly via NVML (equivalent to what the + # daemon would do on startup). 
The systemctl block is best-effort: the + # silent .run installer doesn't always drop the systemd unit, so we + # daemon-reload first and tolerate failure on `start`. + set -x + nvidia-smi -pm 1 || true + systemctl daemon-reload 2>/dev/null || true + systemctl start nvidia-persistenced.service 2>/dev/null || true + set +x } # Replace the toolkit's bind-mounted nvidia libs/binaries inside this From 0d5f0e986d0e36b2dd0954efff59266dae05b9db Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 16:54:41 +0000 Subject: [PATCH 07/16] CI: preserve SUID bit when refreshing container nvidia binaries refresh_container_libs() used 'cp -f --remove-destination' (verbatim from the runner team's nvgha-driver), which without -p/--preserve strips the SUID/SGID bits on the destination. /usr/bin/nvidia-modprobe ships 4755 and NVML's state-changing calls (e.g. nvmlDeviceSetPersistenceMode) route through it; once SUID is gone the container-side call returns NVML_ERROR_UNKNOWN, which is what cuda.core's test_persistence_mode_enabled was hitting. Add a stat diagnostic line at the end of refresh_container_libs() so the next CI log records nvidia-modprobe's post-refresh mode. --- ci/tools/install_gpu_driver.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index cef1d6923f8..242a2fe5d8a 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -167,10 +167,21 @@ refresh_container_libs() { [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; } fi umount "$tgt" 2>/dev/null || true - cp -f --remove-destination "$src" "$tgt" \ + # --preserve=mode keeps the SUID bit. /usr/bin/nvidia-modprobe ships + # 4755 and NVML's state-changing calls (e.g. + # nvmlDeviceSetPersistenceMode) go through it; a plain `cp` strips + # SUID and the call then fails with NVML_ERROR_UNKNOWN. The runner + # team's nvgha-driver has the same bug; we differ here. 
+ cp -f --preserve=mode --remove-destination "$src" "$tgt" \ || echo "WARN: refresh failed for $tgt (src=$src)" >&2 done ldconfig + + # Diagnostic: confirm SUID survived on nvidia-modprobe (the load-bearing + # piece). One-liner so the next CI log proves the fix. + if [ -e /usr/bin/nvidia-modprobe ]; then + stat -c 'refresh: %n mode=%a uid=%u' /usr/bin/nvidia-modprobe >&2 + fi } if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then From 3dfaa8495fa064b0ff5984a0b4c95433fc8979c8 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 18:48:18 +0000 Subject: [PATCH 08/16] CI: exec nvidia-persistenced directly after Linux driver swap The `--silent --no-questions` .run installer drops /usr/bin/nvidia- persistenced but does not reliably install a usable systemd unit, so `systemctl start nvidia-persistenced.service` was a no-op (verified in CI logs: `+ true` after the start). With the daemon down, the /run/nvidia-persistenced/socket bind-mounted into the test container is stale, and NVML state-changing calls (e.g. nvmlDeviceSetPersistenceMode) made by root inside the container return NVML_ERROR_UNKNOWN -- which is what cuda.core's test_persistence_mode_enabled has been failing on. Verified on ComputeLab with the same driver (610.43.02), same GPU arch (Ada L40S), root in container: with the daemon up, the SET call returns NVML_SUCCESS; with the daemon down it returns UnknownError. Fix: exec /usr/bin/nvidia-persistenced directly. The binary self-daemonizes and creates the socket on its own. (Same latent gap exists in nv-gha-runners/vm-images' nvgha-driver; will flag upstream.) 
--- ci/tools/install_gpu_driver.sh | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index 242a2fe5d8a..27c0b147458 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -115,22 +115,23 @@ host_install() { --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) modprobe nvidia nvidia_uvm nvidia_modeset - # Restore the runner image's baseline state: persistence mode ENABLED - # plus nvidia-persistenced running. The runner-team's pre-installed - # drivers come up with `Persistence-M: On`, but our .run install leaves - # it Off, which breaks tests that toggle the value (cuda.core's - # test_persistence_mode_enabled hits NVML_ERROR_UNKNOWN when setting - # the mode to its current value on driver 610.43.02). - # - # `nvidia-smi -pm 1` is the load-bearing call -- it sets the kernel- - # level persistence flag directly via NVML (equivalent to what the - # daemon would do on startup). The systemctl block is best-effort: the - # silent .run installer doesn't always drop the systemd unit, so we - # daemon-reload first and tolerate failure on `start`. + # Bring nvidia-persistenced back up. We stopped it above, and the + # `--silent --no-questions` .run installer drops `/usr/bin/nvidia- + # persistenced` but does not reliably reinstall a usable systemd + # unit -- so a previous attempt at `systemctl start nvidia- + # persistenced.service` was a no-op (see ComputeLab repro on driver + # 610.43.02). Exec the daemon directly; it self-daemonizes and + # creates `/run/nvidia-persistenced/socket`, which NVML clients in + # the test container need for state-changing calls like + # `nvmlDeviceSetPersistenceMode` -- without it those calls return + # NVML_ERROR_UNKNOWN. 
nv-gha-runners/vm-images' `nvgha-driver` has + # the same gap; their CUDA-runtime validation workload doesn't hit + # an NVML SET write so they haven't surfaced it yet. set -x + /usr/bin/nvidia-persistenced --verbose 2>&1 || true + # Set persistence mode explicitly so we match the runner image's + # `Persistence-M: On` baseline regardless of how the daemon came up. nvidia-smi -pm 1 || true - systemctl daemon-reload 2>/dev/null || true - systemctl start nvidia-persistenced.service 2>/dev/null || true set +x } From 701cf2f0f29c470c5093ddaef3b5da08b031ea46 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 19:17:41 +0000 Subject: [PATCH 09/16] CI: pass --user root to nvidia-persistenced after Linux driver swap nvidia-persistenced defaults to `--user nvidia-persistenced`, which our apt-purge of `nvidia-compute-utils-*` removed. Without that user the daemon's setuid(3) post-fork fails and the process exits silently -- the `nvidia-smi -pm 1` right after sees Persistence-M briefly On (daemon held it), then it flips back to Off (daemon gone), and the test container's NVML SET call later returns NVML_ERROR_UNKNOWN. Pass --user root so the daemon doesn't depend on a user account that the purge deleted. Also add a `pgrep nvidia-persistenced` + `ls -la /run/nvidia-persistenced/` diagnostic so the next CI log proves the daemon is alive when the test starts. 
--- ci/tools/install_gpu_driver.sh | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index 27c0b147458..3ddeda9e3ad 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -119,16 +119,27 @@ host_install() { # `--silent --no-questions` .run installer drops `/usr/bin/nvidia- # persistenced` but does not reliably reinstall a usable systemd # unit -- so a previous attempt at `systemctl start nvidia- - # persistenced.service` was a no-op (see ComputeLab repro on driver - # 610.43.02). Exec the daemon directly; it self-daemonizes and - # creates `/run/nvidia-persistenced/socket`, which NVML clients in - # the test container need for state-changing calls like - # `nvmlDeviceSetPersistenceMode` -- without it those calls return - # NVML_ERROR_UNKNOWN. nv-gha-runners/vm-images' `nvgha-driver` has - # the same gap; their CUDA-runtime validation workload doesn't hit - # an NVML SET write so they haven't surfaced it yet. + # persistenced.service` was a no-op. Exec the daemon directly; it + # self-daemonizes and creates `/run/nvidia-persistenced/socket`, + # which NVML clients in the test container need for state-changing + # calls like `nvmlDeviceSetPersistenceMode` -- without it those + # calls return NVML_ERROR_UNKNOWN. + # + # `--user root`: the daemon's default user is `nvidia-persistenced`, + # which our apt purge of `nvidia-compute-utils-*` deleted. Without + # this flag the daemon's setuid(3) call fails post-fork and the + # process exits silently (which leaves Persistence-M flipping back + # to Off the moment we exit the start window). + # + # Same latent gap exists in nv-gha-runners/vm-images' `nvgha-driver`; + # their CUDA-runtime validation workload doesn't issue an NVML SET + # write so they haven't surfaced it yet. 
set -x - /usr/bin/nvidia-persistenced --verbose 2>&1 || true + /usr/bin/nvidia-persistenced --verbose --user root 2>&1 || true + sleep 1 + # Diagnostics: confirm the daemon is alive + socket present. + pgrep -laf nvidia-persistenced || echo "WARN: nvidia-persistenced not running" + ls -la /run/nvidia-persistenced/ 2>&1 || echo "WARN: /run/nvidia-persistenced missing" # Set persistence mode explicitly so we match the runner image's # `Persistence-M: On` baseline regardless of how the daemon came up. nvidia-smi -pm 1 || true From a3f157382b28f8be433e25866446e852664c5180 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 19:20:32 +0000 Subject: [PATCH 10/16] CI: add fast-feedback probe-driver-swap job (workflow_dispatch only) Allocates one L4 GPU + privileged container, runs install_gpu_driver.sh with DRIVER=610.43.02, then drives nvmlDeviceSetPersistenceMode via raw ctypes -- the exact NVML call that cuda.core's test_persistence_mode_enabled exercises. Exits 1 on NVML_ERROR_UNKNOWN so the smoke test fails loudly when the install path leaves the daemon dead. Total runtime ~5 min vs ~30 min for the full test matrix. Triggered by workflow_dispatch only -- this is an opt-in debugging job, not regular PR or nightly traffic. --- .github/workflows/ci.yml | 72 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 82ab7210c92..27d827de7e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -413,6 +413,78 @@ jobs: with: is-release: ${{ github.ref_type == 'tag' }} + # Fast-feedback probe for changes to ci/tools/install_gpu_driver.sh. + # Allocates one L4 GPU + container, runs the driver swap to a + # hard-coded version, then drives nvmlDeviceSetPersistenceMode via + # raw ctypes -- the *exact* NVML call that cuda.core's + # test_persistence_mode_enabled exercises. Total runtime is ~5 min + # vs. ~30 min for a full test matrix. 
+ # + # `workflow_dispatch` only -- this is an opt-in debugging job, not + # part of regular PR or nightly traffic. + probe-driver-swap: + name: Probe custom-DRIVER install + if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'nvidia' }} + runs-on: "linux-amd64-gpu-l4-latest-1" + timeout-minutes: 15 + defaults: + run: + shell: bash --noprofile --norc -xeuo pipefail {0} + container: + options: -u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host + image: ubuntu:22.04 + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + steps: + - name: Checkout + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Install host deps + run: | + apt-get update -qq + apt-get -y install --no-install-recommends util-linux python3 + + - name: Install GPU driver + env: + DRIVER: '610.43.02' + GPU_TYPE: 'l4' + run: ./ci/tools/install_gpu_driver.sh + + - name: Show post-install host + container state + run: | + nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv + echo + echo "=== /run/nvidia-persistenced ===" + ls -la /run/nvidia-persistenced/ 2>&1 || echo "MISSING" + echo + echo "=== nvidia-persistenced process ===" + pgrep -laf nvidia-persistenced || echo "(none)" + + - name: Drive nvmlDeviceSetPersistenceMode via ctypes + run: | + python3 <<'PYEOF' + import ctypes, sys + NVML_SUCCESS, NVML_ERROR_NO_PERMISSION, NVML_ERROR_UNKNOWN = 0, 4, 999 + nvml = ctypes.CDLL("libnvidia-ml.so.1") + assert nvml.nvmlInit_v2() == 0, "nvmlInit_v2 failed" + h = ctypes.c_void_p() + assert nvml.nvmlDeviceGetHandleByIndex_v2(0, ctypes.byref(h)) == 0 + m = ctypes.c_uint(99) + nvml.nvmlDeviceGetPersistenceMode(h, ctypes.byref(m)) + print(f"current persistence_mode = {m.value} (1=ENABLED, 0=DISABLED)") + ret = nvml.nvmlDeviceSetPersistenceMode(h, 0) + print(f"SET DISABLED -> {ret} # 0=SUCCESS, 4=NO_PERMISSION, 999=UNKNOWN") + if ret == NVML_ERROR_UNKNOWN: + print("FAIL: NVML_ERROR_UNKNOWN 
-- daemon-down failure mode reproduced", file=sys.stderr) + sys.exit(1) + if ret != NVML_SUCCESS: + print(f"FAIL: unexpected return code {ret}", file=sys.stderr) + sys.exit(1) + # restore + nvml.nvmlDeviceSetPersistenceMode(h, m.value) + print("OK") + PYEOF + checks: name: Check job status if: always() From c5fef92e8c2d203ca0f1b826a37f08717a8a7fc9 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 19:22:56 +0000 Subject: [PATCH 11/16] CI: drop workflow_dispatch gate on probe-driver-swap so it runs on every PR --- .github/workflows/ci.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 27d827de7e0..3a176deff31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -418,13 +418,11 @@ jobs: # hard-coded version, then drives nvmlDeviceSetPersistenceMode via # raw ctypes -- the *exact* NVML call that cuda.core's # test_persistence_mode_enabled exercises. Total runtime is ~5 min - # vs. ~30 min for a full test matrix. - # - # `workflow_dispatch` only -- this is an opt-in debugging job, not - # part of regular PR or nightly traffic. + # vs. ~30 min for a full test matrix; runs on every PR push so we + # can iterate on `ci/tools/install_gpu_driver.sh` quickly. probe-driver-swap: name: Probe custom-DRIVER install - if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'nvidia' }} + if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} runs-on: "linux-amd64-gpu-l4-latest-1" timeout-minutes: 15 defaults: From f17dd7f95d981d5546fddc423c47bf3e3a1e9510 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 19:34:25 +0000 Subject: [PATCH 12/16] CI: stop refresh_container_libs from clobbering /run/nvidia-persistenced refresh_container_libs() walks /proc/self/mountinfo for entries containing 'nvidia' or 'libcuda'. 
/run/nvidia-persistenced/socket matches that pattern and was being umount'd + cp'd over -- which breaks the container's view of the daemon's IPC socket (the container ends up with a 0-link unlinked socket inode instead of the live host one). Without a working socket, NVML state-changing calls inside the container return NVML_ERROR_UNKNOWN -- which is exactly what cuda.core's test_persistence_mode_enabled was hitting. Restrict the refresh to /usr/(bin|lib) so it only touches the actual binaries + shared libraries that change version with the driver swap. /dev/nvidia*, /proc/driver/nvidia, /run/nvidia-*, /tmp/nvidia-mps are all left as the toolkit set them up. Same latent gap exists in nv-gha-runners/vm-images' nvgha-driver; their CUDA-runtime validation workload never queries the daemon socket so they haven't surfaced it. --- ci/tools/install_gpu_driver.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index 3ddeda9e3ad..05d87357fac 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -155,12 +155,21 @@ refresh_container_libs() { # the kernel suffixes field 4 with " (deleted)" once the host unlinks # the old lib -- don't break discovery. Filters skip what we can't or # shouldn't refresh: - # $3 ~ /^0:/ tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs) - # $5 ~ /\.json$/ vulkan/glvnd config remaps (not version-bound) - # $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers + # $3 ~ /^0:/ tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs) + # $5 must be under /usr/(bin|lib) binaries + libs only -- explicitly + # NOT /run/nvidia-persistenced/socket + # (cp'ing the daemon's IPC socket unlinks + # the container's view and turns later + # NVML state-changing calls into + # NVML_ERROR_UNKNOWN); NOT /dev/nvidia* + # (character devices); NOT /proc/driver/nvidia + # (procfs); NOT /tmp/nvidia-mps (runtime). 
+ # $5 ~ /\.json$/ vulkan/glvnd config remaps (not version-bound) + # $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers local mounts mounts=$(awk ' $3 !~ /^0:/ && + $5 ~ /^\/usr\/(bin|lib)/ && $5 !~ /\.json$/ && $5 !~ /\/(firmware|xorg)\// && $5 ~ /(nvidia|libcuda)/ { print $5 } From 6412f4f43f71fa61f4a3a23f41ebaf5fbf193dcb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 19:44:15 +0000 Subject: [PATCH 13/16] CI: take down nvidia-persistenced via pkill, not systemctl The packaged nvidia-persistenced.service has `RuntimeDirectory=nvidia-persistenced`, which makes systemd `unlink()` /run/nvidia-persistenced/ when the unit stops. The container has that directory bind-mounted from the host as of container-start time. When systemd removes the inode and our subsequent `/usr/bin/nvidia-persistenced --user root` call re-creates it, the container's bind mount is stranded on the deleted inode -- its /run/nvidia-persistenced/socket shows up with link count 0 and NVML state-changing calls return NVML_ERROR_UNKNOWN. `pkill -TERM nvidia-persistenced` sends SIGTERM directly to the daemon, which exits cleanly without involving systemd's RuntimeDirectory cleanup. The host dir keeps its inode across the swap; the container's bind mount stays valid; the new daemon's socket is visible to in-container NVML clients. --- ci/tools/install_gpu_driver.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index 05d87357fac..e7d0359a386 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -89,7 +89,19 @@ in_container() { host_install() { apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod - systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true + # Take down nvidia-persistenced *without* systemctl. 
The packaged + # systemd unit declares `RuntimeDirectory=nvidia-persistenced`, which + # makes systemd unlink /run/nvidia-persistenced/ on stop. The + # container has /run/nvidia-persistenced/ bind-mounted from host, and + # the bind mount points to the dir's inode at container-start time -- + # if systemd removes the dir and the new daemon recreates it under a + # different inode, the container's bind mount goes stale and its + # /run/nvidia-persistenced/socket loses its link to the live daemon + # endpoint (the file shows up with link count 0 inside the container). + # NVML state-changing calls in the container then return + # NVML_ERROR_UNKNOWN. Sending SIGTERM directly keeps the inode alive. + pkill -TERM nvidia-persistenced 2>/dev/null || true + systemctl stop dcgm-exporter 2>/dev/null || true # if-test instead of `fuser ... || true` so a kill failure surfaces # (fuser exits 1 when nothing holds the device, which is the happy path). if fuser /dev/nvidia* >/dev/null 2>&1; then From 2b34f1f21be63480306c8b2dc30df85cd9e7c37c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 19:53:54 +0000 Subject: [PATCH 14/16] CI: re-bind /run/nvidia-persistenced into container after driver swap The container's bind mount of /run/nvidia-persistenced/ is taken at container-start time and pinned to the host directory's then-current inode. Across the install the host directory gets recreated under a fresh inode (the daemon's shutdown + restart cycle replaces it), and the container is stranded on the deleted inode -- socket file shows up with link count 0 inside the container, NVML state-changing calls return NVML_ERROR_UNKNOWN. After refresh_container_libs, umount the stale bind, mkdir the local mount point if missing, and re-bind from /proc/1/root/run/nvidia- persistenced (the host's current view via the privileged container's host-pid-ns access). CAP_SYS_ADMIN required, which custom-DRIVER rows already grant via --privileged --pid=host. 
--- ci/tools/install_gpu_driver.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index e7d0359a386..adeaca81ea6 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -227,6 +227,24 @@ if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \ || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; } refresh_container_libs + + # Re-bind /run/nvidia-persistenced from host. The container's original + # bind mount of this dir was taken at container-start time and points + # to the host's then-current inode. Even with `pkill` (instead of + # systemctl) the host dir is recreated by the new daemon under a fresh + # inode -- leaving the container's bind mount stranded on a deleted + # inode (socket file shows up with link count 0). Re-do the bind mount + # so in-container NVML clients see the live daemon endpoint. Needs + # CAP_SYS_ADMIN, which we get from the --privileged --pid=host the + # workflow adds for custom-DRIVER rows. + if [ -d /proc/1/root/run/nvidia-persistenced ]; then + set -x + umount /run/nvidia-persistenced 2>/dev/null || true + mkdir -p /run/nvidia-persistenced + mount --bind /proc/1/root/run/nvidia-persistenced /run/nvidia-persistenced + ls -la /run/nvidia-persistenced/ >&2 + set +x + fi else host_install fi From 8d8a9ef49e74d5037d31b07d629cac6f74c55711 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 20:39:05 +0000 Subject: [PATCH 15/16] CI: drop install_gpu_driver.sh experiments that turned out non-load-bearing - Revert `pkill -TERM nvidia-persistenced` to `systemctl stop`; pkill alone didn't prevent the host dir's inode from flipping, the re-bind of /run/nvidia-persistenced/ is what restores the container's view. 
- Drop `nvidia-smi -pm 1`; the test exercises NVML's set call, which succeeds once the daemon socket is reachable regardless of current Persistence-M state. - Trim `set -x` blocks and `pgrep`/`ls -la`/`stat` diagnostics that served their purpose during debugging. Keeps the load-bearing changes (nsenter bash -s, /usr/(bin|lib) refresh filter, exec nvidia-persistenced --user root, the /run/nvidia-persistenced re-bind, cp --preserve=mode) and brings the diff against Justin's nvgha-driver back down to the strict minimum. --- ci/tools/install_gpu_driver.sh | 93 +++++++++++----------------------- 1 file changed, 30 insertions(+), 63 deletions(-) diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh index adeaca81ea6..db7ae998587 100755 --- a/ci/tools/install_gpu_driver.sh +++ b/ci/tools/install_gpu_driver.sh @@ -89,19 +89,7 @@ in_container() { host_install() { apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod - # Take down nvidia-persistenced *without* systemctl. The packaged - # systemd unit declares `RuntimeDirectory=nvidia-persistenced`, which - # makes systemd unlink /run/nvidia-persistenced/ on stop. The - # container has /run/nvidia-persistenced/ bind-mounted from host, and - # the bind mount points to the dir's inode at container-start time -- - # if systemd removes the dir and the new daemon recreates it under a - # different inode, the container's bind mount goes stale and its - # /run/nvidia-persistenced/socket loses its link to the live daemon - # endpoint (the file shows up with link count 0 inside the container). - # NVML state-changing calls in the container then return - # NVML_ERROR_UNKNOWN. Sending SIGTERM directly keeps the inode alive. - pkill -TERM nvidia-persistenced 2>/dev/null || true - systemctl stop dcgm-exporter 2>/dev/null || true + systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true # if-test instead of `fuser ... 
|| true` so a kill failure surfaces # (fuser exits 1 when nothing holds the device, which is the happy path). if fuser /dev/nvidia* >/dev/null 2>&1; then @@ -127,35 +115,25 @@ host_install() { --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" ) modprobe nvidia nvidia_uvm nvidia_modeset - # Bring nvidia-persistenced back up. We stopped it above, and the - # `--silent --no-questions` .run installer drops `/usr/bin/nvidia- - # persistenced` but does not reliably reinstall a usable systemd - # unit -- so a previous attempt at `systemctl start nvidia- - # persistenced.service` was a no-op. Exec the daemon directly; it - # self-daemonizes and creates `/run/nvidia-persistenced/socket`, - # which NVML clients in the test container need for state-changing - # calls like `nvmlDeviceSetPersistenceMode` -- without it those - # calls return NVML_ERROR_UNKNOWN. + # Bring nvidia-persistenced back up. NVML state-changing calls from + # inside the test container (e.g. nvmlDeviceSetPersistenceMode, which + # cuda.core's test_persistence_mode_enabled exercises) talk to the + # daemon via /run/nvidia-persistenced/socket; without a live daemon + # they return NVML_ERROR_UNKNOWN. # - # `--user root`: the daemon's default user is `nvidia-persistenced`, - # which our apt purge of `nvidia-compute-utils-*` deleted. Without - # this flag the daemon's setuid(3) call fails post-fork and the - # process exits silently (which leaves Persistence-M flipping back - # to Off the moment we exit the start window). + # systemctl can't start the unit (the `--silent --no-questions` .run + # installer drops /usr/bin/nvidia-persistenced but no usable systemd + # unit), so exec the binary directly -- it self-daemonizes. # - # Same latent gap exists in nv-gha-runners/vm-images' `nvgha-driver`; - # their CUDA-runtime validation workload doesn't issue an NVML SET - # write so they haven't surfaced it yet. 
- set -x - /usr/bin/nvidia-persistenced --verbose --user root 2>&1 || true - sleep 1 - # Diagnostics: confirm the daemon is alive + socket present. - pgrep -laf nvidia-persistenced || echo "WARN: nvidia-persistenced not running" - ls -la /run/nvidia-persistenced/ 2>&1 || echo "WARN: /run/nvidia-persistenced missing" - # Set persistence mode explicitly so we match the runner image's - # `Persistence-M: On` baseline regardless of how the daemon came up. - nvidia-smi -pm 1 || true - set +x + # `--user root` because the default `nvidia-persistenced` user was + # deleted along with `nvidia-compute-utils-*` in the purge above; + # without this flag the daemon's post-fork setuid() fails silently + # and the process exits. + # + # nv-gha-runners/vm-images' `nvgha-driver` has the same latent gap; + # its CUDA-runtime validation workload never issues an NVML SET + # write so it hasn't surfaced there. + /usr/bin/nvidia-persistenced --verbose --user root || true } # Replace the toolkit's bind-mounted nvidia libs/binaries inside this @@ -200,21 +178,13 @@ refresh_container_libs() { [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; } fi umount "$tgt" 2>/dev/null || true - # --preserve=mode keeps the SUID bit. /usr/bin/nvidia-modprobe ships - # 4755 and NVML's state-changing calls (e.g. - # nvmlDeviceSetPersistenceMode) go through it; a plain `cp` strips - # SUID and the call then fails with NVML_ERROR_UNKNOWN. The runner - # team's nvgha-driver has the same bug; we differ here. + # --preserve=mode keeps the SUID bit so the refresh doesn't silently + # de-privilege binaries like nvidia-modprobe that ship 4755 (the + # runner-team's nvgha-driver uses plain `cp` and has the same gap). cp -f --preserve=mode --remove-destination "$src" "$tgt" \ || echo "WARN: refresh failed for $tgt (src=$src)" >&2 done ldconfig - - # Diagnostic: confirm SUID survived on nvidia-modprobe (the load-bearing - # piece). One-liner so the next CI log proves the fix. 
- if [ -e /usr/bin/nvidia-modprobe ]; then - stat -c 'refresh: %n mode=%a uid=%u' /usr/bin/nvidia-modprobe >&2 - fi } if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then @@ -228,22 +198,19 @@ if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; } refresh_container_libs - # Re-bind /run/nvidia-persistenced from host. The container's original - # bind mount of this dir was taken at container-start time and points - # to the host's then-current inode. Even with `pkill` (instead of - # systemctl) the host dir is recreated by the new daemon under a fresh - # inode -- leaving the container's bind mount stranded on a deleted - # inode (socket file shows up with link count 0). Re-do the bind mount - # so in-container NVML clients see the live daemon endpoint. Needs - # CAP_SYS_ADMIN, which we get from the --privileged --pid=host the - # workflow adds for custom-DRIVER rows. + # Re-bind /run/nvidia-persistenced from the host. The container's + # original bind mount was pinned to the host's at-container-start + # inode; the daemon stop+restart cycle recreates the dir under a + # fresh inode, stranding the bind mount on the deleted one (socket + # shows up with link count 0 inside the container, NVML SET calls + # return NVML_ERROR_UNKNOWN). Re-bind from /proc/1/root so the + # container picks up the live host dir. Needs CAP_SYS_ADMIN, which + # the workflow grants via --privileged --pid=host on custom-DRIVER + # rows. 
if [ -d /proc/1/root/run/nvidia-persistenced ]; then - set -x umount /run/nvidia-persistenced 2>/dev/null || true mkdir -p /run/nvidia-persistenced mount --bind /proc/1/root/run/nvidia-persistenced /run/nvidia-persistenced - ls -la /run/nvidia-persistenced/ >&2 - set +x fi else host_install From d2c25ebbb45d9c1615bf9544f60f05aae2453c76 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 7 Jun 2026 21:37:23 +0000 Subject: [PATCH 16/16] Revert: remove the probe-driver-swap fast-feedback job Added in a3f157382b for fast iteration on install_gpu_driver.sh; no longer needed now that the script has stabilized. --- .github/workflows/ci.yml | 70 ---------------------------------------- 1 file changed, 70 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a176deff31..82ab7210c92 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -413,76 +413,6 @@ jobs: with: is-release: ${{ github.ref_type == 'tag' }} - # Fast-feedback probe for changes to ci/tools/install_gpu_driver.sh. - # Allocates one L4 GPU + container, runs the driver swap to a - # hard-coded version, then drives nvmlDeviceSetPersistenceMode via - # raw ctypes -- the *exact* NVML call that cuda.core's - # test_persistence_mode_enabled exercises. Total runtime is ~5 min - # vs. ~30 min for a full test matrix; runs on every PR push so we - # can iterate on `ci/tools/install_gpu_driver.sh` quickly. 
- probe-driver-swap: - name: Probe custom-DRIVER install - if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} - runs-on: "linux-amd64-gpu-l4-latest-1" - timeout-minutes: 15 - defaults: - run: - shell: bash --noprofile --norc -xeuo pipefail {0} - container: - options: -u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host - image: ubuntu:22.04 - env: - NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} - steps: - - name: Checkout - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - - name: Install host deps - run: | - apt-get update -qq - apt-get -y install --no-install-recommends util-linux python3 - - - name: Install GPU driver - env: - DRIVER: '610.43.02' - GPU_TYPE: 'l4' - run: ./ci/tools/install_gpu_driver.sh - - - name: Show post-install host + container state - run: | - nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv - echo - echo "=== /run/nvidia-persistenced ===" - ls -la /run/nvidia-persistenced/ 2>&1 || echo "MISSING" - echo - echo "=== nvidia-persistenced process ===" - pgrep -laf nvidia-persistenced || echo "(none)" - - - name: Drive nvmlDeviceSetPersistenceMode via ctypes - run: | - python3 <<'PYEOF' - import ctypes, sys - NVML_SUCCESS, NVML_ERROR_NO_PERMISSION, NVML_ERROR_UNKNOWN = 0, 4, 999 - nvml = ctypes.CDLL("libnvidia-ml.so.1") - assert nvml.nvmlInit_v2() == 0, "nvmlInit_v2 failed" - h = ctypes.c_void_p() - assert nvml.nvmlDeviceGetHandleByIndex_v2(0, ctypes.byref(h)) == 0 - m = ctypes.c_uint(99) - nvml.nvmlDeviceGetPersistenceMode(h, ctypes.byref(m)) - print(f"current persistence_mode = {m.value} (1=ENABLED, 0=DISABLED)") - ret = nvml.nvmlDeviceSetPersistenceMode(h, 0) - print(f"SET DISABLED -> {ret} # 0=SUCCESS, 4=NO_PERMISSION, 999=UNKNOWN") - if ret == NVML_ERROR_UNKNOWN: - print("FAIL: NVML_ERROR_UNKNOWN -- daemon-down failure mode reproduced", file=sys.stderr) - sys.exit(1) - if ret != NVML_SUCCESS: - print(f"FAIL: unexpected return code {ret}", 
file=sys.stderr) - sys.exit(1) - # restore - nvml.nvmlDeviceSetPersistenceMode(h, m.value) - print("OK") - PYEOF - checks: name: Check job status if: always()