From b1b6070a823898a01e87c49cb1682f6d65a96eb9 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 01:48:44 +0000
Subject: [PATCH 01/16] CI: allow specifying custom driver versions in test
 matrix

Extends the DRIVER field in ci/test-matrix.yml beyond 'latest'/'earliest'
to accept an explicit version string (e.g. '580.65.06'). For Linux,
ci/tools/install_gpu_driver.sh (adapted from nv-gha-runners/vm-images
PR #256) swaps the driver in-job via nsenter when the row uses a custom
version; for Windows, ci/tools/install_gpu_driver.ps1 is split into
install + configure_driver_mode, with the install step gated on the
DRIVER value and the mode step always running.

The matrix row is routed to a 'latest' runner image when the DRIVER is
a custom version (the install scripts perform the swap themselves).
Container privileges on Linux (--privileged --pid=host) are added only
on rows with a custom DRIVER. Custom DRIVER + FLAVOR=wsl is rejected
eagerly in the compute-matrix step.

Two existing nightly-numba-cuda rows exercise the new path:
- Linux amd64 / 13.3.0 / l4 -> 580.65.06
- Windows amd64 / 13.3.0 / l4 -> 610.47

Closes #293
Closes #1265
---
 .github/workflows/coverage.yml           |   8 +-
 .github/workflows/test-wheel-linux.yml   |  28 +++-
 .github/workflows/test-wheel-windows.yml |  20 ++-
 ci/test-matrix.yml                       |  13 +-
 ci/tools/configure_driver_mode.ps1       |  45 ++++++
 ci/tools/install_gpu_driver.ps1          |  51 +++----
 ci/tools/install_gpu_driver.sh           | 167 +++++++++++++++++++++++
 7 files changed, 288 insertions(+), 44 deletions(-)
 create mode 100644 ci/tools/configure_driver_mode.ps1
 create mode 100755 ci/tools/install_gpu_driver.sh

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 9581cff3088..de1e713e499 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -275,13 +275,15 @@ jobs:
         uses: nv-gha-runners/setup-proxy-cache@main
         continue-on-error: true
 
-      - name: Update driver
+      # DRIVER above is 'latest' so install_gpu_driver.ps1 is intentionally
+      # skipped (it errors on latest/earliest); configure_driver_mode.ps1
+      # still runs to put the pre-installed driver into TCC mode.
+      - name: Configure driver mode
         shell: powershell
         env:
           DRIVER_MODE: "TCC"
-          GPU_TYPE: "a100"
         run: |
-          ci/tools/install_gpu_driver.ps1
+          ci/tools/configure_driver_mode.ps1
 
       - name: Ensure GPU is working
         run: |
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index f8002f5124a..4f56cb57740 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -85,8 +85,13 @@ jobs:
           # Read base matrix from YAML file for the specific architecture
           TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)
 
-          # Apply matrix filter and wrap in include structure
-          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
+          # Apply matrix filter; reject custom DRIVER + FLAVOR=wsl (the
+          # in-container driver swap doesn't work under WSL); add a
+          # RUNNER_DRIVER field that maps any custom version back to
+          # 'latest' (the install script swaps the driver itself, so we
+          # need to land on the runner that ships with the most recent
+          # pre-installed driver); wrap in include structure.
+          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
 
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
@@ -101,13 +106,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
-    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
     # The build stage could fail but we want the CI to keep moving.
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     # Our self-hosted runners require a container
     # TODO: use a different (nvidia?) container
     container:
-      options: -u root --security-opt seccomp=unconfined --shm-size 16g
+      # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh
+      # can nsenter to the host for the install + refresh the toolkit bind mounts
+      # back inside the container. Stock options for latest/earliest rows.
+      options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }}
       image: ubuntu:22.04
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
@@ -131,6 +139,18 @@ jobs:
           dependencies: "jq wget libgl1 libegl1 g++"
           dependent_exes: "jq wget"
 
+      - name: Install GPU driver
+        if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
+        env:
+          DRIVER: ${{ matrix.DRIVER }}
+          GPU_TYPE: ${{ matrix.GPU }}
+        run: |
+          # util-linux for nsenter; install_gpu_driver.sh re-execs onto the
+          # host (requires --privileged --pid=host on the container, set
+          # conditionally above) and refreshes the toolkit bind mounts here.
+          apt-get -y install --no-install-recommends util-linux
+          ./ci/tools/install_gpu_driver.sh
+
       - name: Set environment variables
         env:
           BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 320817177f3..5675b395afe 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -81,8 +81,11 @@ jobs:
           # Read base matrix from YAML file for the specific architecture
           TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)
 
-          # Apply matrix filter and wrap in include structure
-          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
+          # Apply matrix filter; add a RUNNER_DRIVER field that maps any
+          # custom DRIVER version back to 'latest' (install_gpu_driver.ps1
+          # swaps the driver itself, so the runner must be the one that
+          # ships the most recent pre-installed driver); wrap in include.
+          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
 
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
@@ -95,7 +98,7 @@ jobs:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
-    runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
     steps:
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
@@ -106,13 +109,20 @@ jobs:
         with:
           enable-apt: true
 
-      - name: Update driver
+      - name: Install GPU driver
+        if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
         env:
-          DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
+          DRIVER: ${{ matrix.DRIVER }}
           GPU_TYPE: ${{ matrix.GPU }}
         run: |
           ci/tools/install_gpu_driver.ps1
 
+      - name: Configure driver mode
+        env:
+          DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
+        run: |
+          ci/tools/configure_driver_mode.ps1
+
       - name: Ensure GPU is working
         run: |
           nvidia-smi
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 95c5e714caa..3d5693a188a 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -13,7 +13,16 @@
 # Windows entries also include DRIVER_MODE.
 #
 # Notes:
+# - DRIVER accepts:
+#     * 'latest'   - use the runner's pre-installed latest driver (no install step)
+#     * 'earliest' - use the runner's pre-installed earliest driver (no install step)
+#     * a version string (e.g. '580.65.06')
+#                  - install that version via ci/tools/install_gpu_driver.sh (Linux)
+#                    or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the
+#                    job. The matrix row is routed to the 'latest' runner image (the
+#                    install scripts swap the driver themselves).
 # - DRIVER: 'earliest' does not work with CUDA 12.9.1
+# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux.
 
 linux:
   pull-request:
@@ -74,7 +83,7 @@ linux:
     - { MODE: 'nightly-pytorch',    ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     # nightly-standard (arm64 l4×2 — nightly-only per runner team request)
@@ -113,4 +122,4 @@ windows:
     - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47',  DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1
new file mode 100644
index 00000000000..280e725e11b
--- /dev/null
+++ b/ci/tools/configure_driver_mode.ps1
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI
+# runner and cycle the display devices so the new mode takes effect
+# without rebooting. Always runs (whether or not install_gpu_driver.ps1
+# just ran). When install_gpu_driver.ps1 has run, this single device
+# cycle also activates the freshly-installed driver.
+#
+# Inputs (env):
+#   DRIVER_MODE  One of WDDM, TCC, MCDM.
+
+function Set-DriverMode {
+
+    # Map matrix DRIVER_MODE to nvidia-smi -fdm code.
+    # This assumes we have the prior knowledge on which GPU can use which mode.
+    $driver_mode = $env:DRIVER_MODE
+    if ($driver_mode -eq "WDDM") {
+        Write-Output "Setting driver mode to WDDM..."
+        nvidia-smi -fdm 0
+    } elseif ($driver_mode -eq "TCC") {
+        Write-Output "Setting driver mode to TCC..."
+        nvidia-smi -fdm 1
+    } elseif ($driver_mode -eq "MCDM") {
+        Write-Output "Setting driver mode to MCDM..."
+        nvidia-smi -fdm 2
+    } else {
+        Write-Output "Unknown driver mode: $driver_mode"
+        exit 1
+    }
+
+    # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
+    $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
+    foreach ($device in $nvidia_devices) {
+        Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
+        pnputil /disable-device "$($device.InstanceId)"
+        pnputil /enable-device "$($device.InstanceId)"
+    }
+    # Give it a minute to settle:
+    Start-Sleep -Seconds 5
+}
+
+# Run the functions
+Set-DriverMode
diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1
index c98416c87e2..e61c6bbdbb1 100644
--- a/ci/tools/install_gpu_driver.ps1
+++ b/ci/tools/install_gpu_driver.ps1
@@ -1,13 +1,30 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+#
+# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a
+# Windows CI runner. Driver-mode selection and the post-install device
+# power-cycle are the responsibility of configure_driver_mode.ps1, which
+# the workflow runs immediately after this script (or by itself when
+# DRIVER is 'latest'/'earliest' and the runner already brings up the
+# right driver).
+#
+# Inputs (env):
+#   DRIVER    Driver version, e.g. "610.47". Must NOT be 'latest' or
+#             'earliest' -- those are runner-pre-installed and the
+#             workflow is expected to skip this script for them.
+#   GPU_TYPE  Lower-case GPU label from the matrix (e.g. "l4", "rtx4090").
+#             Selects the data-center vs desktop installer variant.
 
 # Install the driver
 function Install-Driver {
 
-    # Set the correct URL, filename, and arguments to the installer
-    # This driver is picked to support Windows 11 & CUDA 13.0
-    $version = '581.15'
+    # Driver version is plumbed from the matrix via the DRIVER env var.
+    $version = $env:DRIVER
+    if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') {
+        Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'."
+        exit 1
+    }
 
     # Get GPU type from environment variable
     $gpu_type = $env:GPU_TYPE
@@ -54,33 +71,7 @@ function Install-Driver {
     # Install the file with the specified path from earlier
     Write-Output 'Running the driver installer...'
     Start-Process -FilePath $filepath -ArgumentList $install_args -Wait
-    Write-Output 'Done!'
-
-    # Handle driver mode configuration
-    # This assumes we have the prior knowledge on which GPU can use which mode.
-    $driver_mode = $env:DRIVER_MODE
-    if ($driver_mode -eq "WDDM") {
-        Write-Output "Setting driver mode to WDDM..."
-        nvidia-smi -fdm 0
-    } elseif ($driver_mode -eq "TCC") {
-        Write-Output "Setting driver mode to TCC..."
-        nvidia-smi -fdm 1
-    } elseif ($driver_mode -eq "MCDM") {
-        Write-Output "Setting driver mode to MCDM..."
-        nvidia-smi -fdm 2
-    } else {
-        Write-Output "Unknown driver mode: $driver_mode"
-        exit 1
-    }
-    # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
-    $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
-    foreach ($device in $nvidia_devices) {
-        Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
-        pnputil /disable-device "$($device.InstanceId)"
-        pnputil /enable-device "$($device.InstanceId)"
-    }
-    # Give it a minute to settle:
-    Start-Sleep -Seconds 5
+    Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.'
 }
 
 # Run the functions
diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
new file mode 100755
index 00000000000..5dff7043487
--- /dev/null
+++ b/ci/tools/install_gpu_driver.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# install_gpu_driver.sh -- install a specific NVIDIA driver version on a
+# Linux CI runner. Adapted from nv-gha-runners/vm-images PR #256
+# (`nvgha-driver` CLI), trimmed and parameterised for cuda-python's CI.
+#
+# !!! ALPHA !!!
+# Performs live modifications to the host driver stack (kernel module
+# reload, package replacement, and -- inside containers -- toolkit
+# bind-mount refresh) and may cause issues.
+#
+# Inputs (env):
+#   DRIVER    Driver version, e.g. "580.65.06". Must NOT be 'latest' or
+#             'earliest' -- those are runner-pre-installed and the
+#             workflow is expected to skip this script for them.
+#   GPU_TYPE  Lower-case GPU label from the matrix (e.g. "v100", "l4",
+#             "h100"). Used only to pick the kernel module flavor
+#             (Volta needs the proprietary/legacy module; everything
+#             newer can use the open module).
+#
+# Arch is detected from `uname -m`.
+#
+# When the script runs inside a container (the cuda-python Linux jobs do)
+# it re-execs itself on the host via `nsenter`. The job must declare
+# `options: --privileged --pid=host` (the workflow only does this for
+# matrix rows with a custom DRIVER). After the host-side install, the
+# container's bind-mounted nvidia libs/binaries are refreshed in-place so
+# the new driver is visible without restarting the container.
+set -euo pipefail
+
+: "${DRIVER:?DRIVER env var is required (e.g. 580.65.06)}"
+: "${GPU_TYPE:?GPU_TYPE env var is required (e.g. l4)}"
+
+case "$DRIVER" in
+  latest|earliest)
+    echo "::error::install_gpu_driver.sh must not be invoked with DRIVER=$DRIVER (runner-pre-installed)" >&2
+    exit 1
+    ;;
+esac
+
+VERSION="$DRIVER"
+
+# Volta (V100) requires the legacy/proprietary kernel module; all newer
+# GPUs in this matrix support the open module. Extend this if/when older
+# GPUs return to the matrix.
+case "$GPU_TYPE" in
+  v100) KMT=proprietary ;;
+  *)    KMT=open ;;
+esac
+
+case "$(uname -m)" in
+  x86_64)
+    ARCH_DIR=Linux-x86_64
+    ARCH_SUFFIX=x86_64
+    ;;
+  aarch64)
+    ARCH_DIR=Linux-aarch64
+    ARCH_SUFFIX=aarch64
+    ;;
+  *)
+    echo "::error::unsupported arch: $(uname -m)" >&2
+    exit 1
+    ;;
+esac
+
+URL="https://us.download.nvidia.com/XFree86/${ARCH_DIR}/${VERSION}/NVIDIA-Linux-${ARCH_SUFFIX}-${VERSION}.run"
+
+# Re-elevate to root if needed (sudo is preinstalled on the runner image).
+if [ "$(id -u)" != 0 ]; then
+  exec sudo -E DRIVER="$DRIVER" GPU_TYPE="$GPU_TYPE" "$0" "$@"
+fi
+
+echo "install_gpu_driver.sh is ALPHA -- it performs live modifications to the host driver stack and may cause issues" >&2
+echo "DRIVER=${VERSION}  GPU_TYPE=${GPU_TYPE}  KMT=${KMT}  ARCH=${ARCH_SUFFIX}" >&2
+echo "URL=${URL}" >&2
+
+# Toolkit packages we keep across the purge: dockerd's --runtime=nvidia
+# resolves nvidia-container-runtime through these, and removing them
+# breaks `docker exec` against any container started with that runtime.
+KEEP_RE='^(nvidia-container-toolkit(-base)?|libnvidia-container1|libnvidia-container-tools)$'
+
+in_container() {
+  [ -f /.dockerenv ] || grep -qE '/(docker|kubepods|containerd)' /proc/1/cgroup 2>/dev/null
+}
+
+host_install() {
+  apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod
+
+  systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true
+  # if-test instead of `fuser ... || true` so a kill failure surfaces
+  # (fuser exits 1 when nothing holds the device, which is the happy path).
+  if fuser /dev/nvidia* >/dev/null 2>&1; then
+    fuser -kv /dev/nvidia*
+  fi
+  sleep 1
+  for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+    rmmod "$m" 2>/dev/null || true
+  done
+
+  # Purge existing nvidia/libnvidia packages, except the toolkit pieces
+  # captured by KEEP_RE. Tolerate apt failures: postrm scripts can trip
+  # and the .run installer is about to replace everything anyway.
+  dpkg-query -W -f='${Package}\n' 'nvidia-*' 'libnvidia-*' 2>/dev/null \
+    | awk -v re="$KEEP_RE" '$0 !~ re' \
+    | xargs -r apt-get -y remove --purge || true
+
+  local d
+  d=$(mktemp -d)
+  ( cd "$d" \
+    && wget -q -O installer.run "$URL" \
+    && sh installer.run --silent --dkms --no-questions \
+         --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
+  modprobe nvidia nvidia_uvm nvidia_modeset
+}
+
+# Replace the toolkit's bind-mounted nvidia libs/binaries inside this
+# container with copies from the host's new install. `cp` (not
+# `mount --bind`) because procfs-routed binds drop the exec bit.
+refresh_container_libs() {
+  # Walk /proc/self/mountinfo and match the toolkit-injected nvidia
+  # binds via mount point (field 5) so deleted source paths -- which
+  # the kernel suffixes field 4 with " (deleted)" once the host unlinks
+  # the old lib -- don't break discovery. Filters skip what we can't or
+  # shouldn't refresh:
+  #   $3 ~ /^0:/                tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs)
+  #   $5 ~ /\.json$/            vulkan/glvnd config remaps (not version-bound)
+  #   $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers
+  local mounts
+  mounts=$(awk '
+    $3 !~ /^0:/                     &&
+    $5 !~ /\.json$/                 &&
+    $5 !~ /\/(firmware|xorg)\//     &&
+    $5 ~ /(nvidia|libcuda)/         { print $5 }
+  ' /proc/self/mountinfo | sort -u)
+
+  for tgt in $mounts; do
+    local src="/proc/1/root$tgt"
+    if [ ! -e "$src" ]; then
+      # Driver swap rewrites the version suffix (libfoo.so.595.71.05 ->
+      # libfoo.so.580.65.06); strip it and find the new file.
+      local base
+      base=$(basename "$tgt")
+      base="${base%.so.*}.so"
+      src=$(find "/proc/1/root$(dirname "$tgt")" -maxdepth 1 -name "${base}.*" 2>/dev/null \
+            | sort -V | tail -n1)
+      [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; }
+    fi
+    umount "$tgt" 2>/dev/null || true
+    cp -f --remove-destination "$src" "$tgt" \
+      || echo "WARN: refresh failed for $tgt (src=$src)" >&2
+  done
+  ldconfig
+}
+
+if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
+  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \
+    || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; }
+  refresh_container_libs
+else
+  host_install
+fi
+
+nvidia-smi >/dev/null
+grep -qF "$VERSION" /proc/driver/nvidia/version

From 3e016b572dcded7701f9b2f12c25cec3cb7e5b1d Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 03:09:01 +0000
Subject: [PATCH 02/16] CI: fix Linux driver nsenter re-exec, swap Windows
 version, enable ci.yml dispatch

- install_gpu_driver.sh: pipe the script body to the host-side bash via
  stdin (bash -s < "$0") instead of re-execing "$0". The script lives
  in the GH workspace mount (container-only), so the relative path
  doesn't resolve after nsenter switches the mount namespace.
  The < "$0" fd is opened before nsenter and survives the flip.
- test-matrix.yml: Windows nightly-numba-cuda row 610.47 -> 596.36
  (610.47 isn't published on the CDN; install hit 404).
- ci.yml: add workflow_dispatch: trigger so the pipeline can be
  re-run manually. The existing should-skip / detect-changes gates
  already handle non-PR events.
---
 .github/workflows/ci.yml       | 1 +
 ci/test-matrix.yml             | 2 +-
 ci/tools/install_gpu_driver.sh | 8 +++++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9f9236b09fe..82ab7210c92 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ on:
   schedule:
     # every 24 hours at midnight UTC
     - cron: "0 0 * * *"
+  workflow_dispatch:
 
 jobs:
   ci-vars:
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 3d5693a188a..730791ac283 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -122,4 +122,4 @@ windows:
     - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.47',  DRIVER_MODE: 'TCC' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36',  DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index 5dff7043487..b7aeb3434f4 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -156,7 +156,13 @@ refresh_container_libs() {
 }
 
 if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
-  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- "$0" \
+  # Re-exec on the host. The runner-team's `nvgha-driver` script lives at a
+  # host-side absolute path so `"$0"` survives the mount-namespace flip;
+  # ours lives in the GH workspace mount (container-only), so we pipe the
+  # script body in via stdin instead -- the `< "$0"` fd is opened before
+  # nsenter and stays valid across the namespace switch. Env vars (DRIVER,
+  # GPU_TYPE, _NVDRV_NSENTERED) are inherited by the host-side bash.
+  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \
     || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; }
   refresh_container_libs
 else

From c0ca8696e64c7fb1e275ae628905ba6d86144279 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 03:27:53 +0000
Subject: [PATCH 03/16] CI: move 'Ensure GPU is working' after 'Install GPU
 driver' on Linux

So nvidia-smi validates the post-install driver state on custom-DRIVER
rows. Windows test-wheel + coverage already use Install -> Configure ->
Ensure; this brings the Linux test-wheel job into line.
---
 .github/workflows/test-wheel-linux.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 4f56cb57740..57bc4dc555f 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -121,9 +121,6 @@ jobs:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
         PIP_CACHE_DIR: "/tmp/pip-cache"
     steps:
-      - name: Ensure GPU is working
-        run: nvidia-smi
-
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
 
@@ -151,6 +148,9 @@ jobs:
           apt-get -y install --no-install-recommends util-linux
           ./ci/tools/install_gpu_driver.sh
 
+      - name: Ensure GPU is working
+        run: nvidia-smi
+
       - name: Set environment variables
         env:
           BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}

From 4a23b23a26111a8ead0cc40519a2b558ea9bfe66 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 03:30:30 +0000
Subject: [PATCH 04/16] CI: flip two PR-matrix Linux rows to DRIVER=610.43.02

Exercises the custom-driver install path on every PR (not just nightly).
Both rows are amd64 / 13.3.0 / local-CTK, on l4 and rtxpro6000 -- both
use the 'open' kernel-module flavor (only Volta needs 'proprietary').
---
 ci/test-matrix.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 730791ac283..51f0d3f063f 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -38,10 +38,10 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: '610.43.02' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }

From d33a928eb464987b6de6eeceaf7ab9454ffe1e91 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 14:16:08 +0000
Subject: [PATCH 05/16] CI: restart nvidia-persistenced on Linux; poll
 nvidia-smi on Windows

Linux: After install_gpu_driver.sh stops nvidia-persistenced and the apt
purge removes the package, the .run installer reinstalls the systemd
service but leaves it stopped. cuda.core's test_persistence_mode_enabled
fails with NVML_ERROR_UNKNOWN on driver 610.43.02 when the daemon is
not running; explicitly start it again at the end of host_install().

Windows: configure_driver_mode.ps1's trailing 'Start-Sleep -Seconds 5'
is not enough on multi-GPU rows that are slow to come back up (observed:
2x H100 MCDM). Replace it with a poll-until-success loop on nvidia-smi
with a 60s deadline, matching the runner-team nvgha-driver.ps1 pattern.
The problem was previously masked because every Windows row used to run
the full install pipeline; with custom-DRIVER plumbing, latest/earliest
rows skip the install and the cycle is no longer preceded by warm-up time.
---
 ci/tools/configure_driver_mode.ps1 | 17 +++++++++++++++--
 ci/tools/install_gpu_driver.sh     |  9 +++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1
index 280e725e11b..42e0914935d 100644
--- a/ci/tools/configure_driver_mode.ps1
+++ b/ci/tools/configure_driver_mode.ps1
@@ -37,8 +37,21 @@ function Set-DriverMode {
         pnputil /disable-device "$($device.InstanceId)"
         pnputil /enable-device "$($device.InstanceId)"
     }
-    # Give it a minute to settle:
-    Start-Sleep -Seconds 5
+
+    # Poll nvidia-smi until NVML can initialize, or give up after ~60s.
+    # A fixed sleep is not enough on slower-coming-back-up multi-GPU rows
+    # (e.g. 2x H100 MCDM) where pnputil enable returns before NVML is
+    # ready. Pattern borrowed from the runner-team `nvgha-driver.ps1`.
+    Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..."
+    $deadline = (Get-Date).AddSeconds(60)
+    do {
+        Start-Sleep -Seconds 2
+        & nvidia-smi.exe 2>&1 | Out-Null
+    } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline)
+    if ($LASTEXITCODE -ne 0) {
+        Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle"
+        exit 1
+    }
 }
 
 # Run the functions
diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index b7aeb3434f4..f104ed09751 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -114,6 +114,15 @@ host_install() {
     && sh installer.run --silent --dkms --no-questions \
          --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
   modprobe nvidia nvidia_uvm nvidia_modeset
+
+  # Restore nvidia-persistenced. We stopped it before the install (and the
+  # purge may have removed it); the .run installer reinstalls the service.
+  # Some NVML calls -- e.g. nvmlDeviceSetPersistenceMode -- can fail with
+  # NVML_ERROR_UNKNOWN on newer drivers when the daemon isn't running, and
+  # cuda.core's test_persistence_mode_enabled trips on that.
+  if systemctl list-unit-files 2>/dev/null | grep -q '^nvidia-persistenced\.service'; then
+    systemctl start nvidia-persistenced || true
+  fi
 }
 
 # Replace the toolkit's bind-mounted nvidia libs/binaries inside this

From 00896dc0f6f3fea81f85485c973d1fffcaf48584 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 15:07:21 +0000
Subject: [PATCH 06/16] CI: re-enable persistence mode after Linux driver swap

Runner-latest L4 images come up with Persistence-M=On (set somewhere in
the runner team's image setup, not in cuda-python). Our .run install
leaves it Off, which breaks cuda.core's test_persistence_mode_enabled
on driver 610.43.02 -- the test assigns
device.is_persistence_mode_enabled = False on a device that already
reports False, and 610.43.02 returns NVML_ERROR_UNKNOWN for that no-op
set.

Restore the runner baseline by calling `nvidia-smi -pm 1` at the end of
host_install() (sets the kernel persistence flag directly via NVML).
Also run daemon-reload and start nvidia-persistenced.service
best-effort so tools that look for the daemon find it, and wrap this
trailing block in `set -x` so the next run's log confirms which lines
fired.
---
 ci/tools/install_gpu_driver.sh | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index f104ed09751..cef1d6923f8 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -115,14 +115,23 @@ host_install() {
          --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
   modprobe nvidia nvidia_uvm nvidia_modeset
 
-  # Restore nvidia-persistenced. We stopped it before the install (and the
-  # purge may have removed it); the .run installer reinstalls the service.
-  # Some NVML calls -- e.g. nvmlDeviceSetPersistenceMode -- can fail with
-  # NVML_ERROR_UNKNOWN on newer drivers when the daemon isn't running, and
-  # cuda.core's test_persistence_mode_enabled trips on that.
-  if systemctl list-unit-files 2>/dev/null | grep -q '^nvidia-persistenced\.service'; then
-    systemctl start nvidia-persistenced || true
-  fi
+  # Restore the runner image's baseline state: persistence mode ENABLED
+  # plus nvidia-persistenced running. The runner-team's pre-installed
+  # drivers come up with `Persistence-M: On`, but our .run install leaves
+  # it Off, which breaks tests that toggle the value (cuda.core's
+  # test_persistence_mode_enabled hits NVML_ERROR_UNKNOWN when setting
+  # the mode to its current value on driver 610.43.02).
+  #
+  # `nvidia-smi -pm 1` is the load-bearing call -- it sets the kernel-
+  # level persistence flag directly via NVML (equivalent to what the
+  # daemon would do on startup). The systemctl block is best-effort: the
+  # silent .run installer doesn't always drop the systemd unit, so we
+  # daemon-reload first and tolerate failure on `start`.
+  set -x
+  nvidia-smi -pm 1 || true
+  systemctl daemon-reload 2>/dev/null || true
+  systemctl start nvidia-persistenced.service 2>/dev/null || true
+  set +x
 }
 
 # Replace the toolkit's bind-mounted nvidia libs/binaries inside this

From 0d5f0e986d0e36b2dd0954efff59266dae05b9db Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 16:54:41 +0000
Subject: [PATCH 07/16] CI: preserve SUID bit when refreshing container nvidia
 binaries

refresh_container_libs() used 'cp -f --remove-destination' (verbatim
from the runner team's nvgha-driver), which without -p/--preserve
strips the SUID/SGID bits on the destination. /usr/bin/nvidia-modprobe
ships 4755 and NVML's state-changing calls (e.g.
nvmlDeviceSetPersistenceMode) route through it; once SUID is gone the
container-side call returns NVML_ERROR_UNKNOWN, which is what cuda.core's
test_persistence_mode_enabled was hitting.

Pass --preserve=mode to cp so the refreshed binaries keep their mode
bits. Also add a stat diagnostic line at the end of
refresh_container_libs() so the next CI log records nvidia-modprobe's
post-refresh mode.
---
 ci/tools/install_gpu_driver.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index cef1d6923f8..242a2fe5d8a 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -167,10 +167,21 @@ refresh_container_libs() {
       [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; }
     fi
     umount "$tgt" 2>/dev/null || true
-    cp -f --remove-destination "$src" "$tgt" \
+    # --preserve=mode keeps the SUID bit. /usr/bin/nvidia-modprobe ships
+    # 4755 and NVML's state-changing calls (e.g.
+    # nvmlDeviceSetPersistenceMode) go through it; a plain `cp` strips
+    # SUID and the call then fails with NVML_ERROR_UNKNOWN. The runner
+    # team's nvgha-driver has the same bug; we differ here.
+    cp -f --preserve=mode --remove-destination "$src" "$tgt" \
       || echo "WARN: refresh failed for $tgt (src=$src)" >&2
   done
   ldconfig
+
+  # Diagnostic: confirm SUID survived on nvidia-modprobe (the load-bearing
+  # piece). One-liner so the next CI log proves the fix.
+  if [ -e /usr/bin/nvidia-modprobe ]; then
+    stat -c 'refresh: %n mode=%a uid=%u' /usr/bin/nvidia-modprobe >&2
+  fi
 }
 
 if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then

From 3dfaa8495fa064b0ff5984a0b4c95433fc8979c8 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 18:48:18 +0000
Subject: [PATCH 08/16] CI: exec nvidia-persistenced directly after Linux
 driver swap

The `--silent --no-questions` .run installer drops /usr/bin/nvidia-
persistenced but does not reliably install a usable systemd unit, so
`systemctl start nvidia-persistenced.service` was a no-op (verified
in CI logs: `+ true` after the start). With the daemon down, the
/run/nvidia-persistenced/socket bind-mounted into the test container
is stale, and NVML state-changing calls (e.g.
nvmlDeviceSetPersistenceMode) made by root inside the container
return NVML_ERROR_UNKNOWN -- which is what cuda.core's
test_persistence_mode_enabled has been failing on.

Verified on ComputeLab with the same driver (610.43.02), same GPU
arch (Ada L40S), root in container: with the daemon up, the SET call
returns NVML_SUCCESS; with the daemon down it returns UnknownError.

Fix: exec /usr/bin/nvidia-persistenced directly. The binary
self-daemonizes and creates the socket on its own. (Same latent gap
exists in nv-gha-runners/vm-images' nvgha-driver; will flag upstream.)
---
 ci/tools/install_gpu_driver.sh | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index 242a2fe5d8a..27c0b147458 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -115,22 +115,23 @@ host_install() {
          --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
   modprobe nvidia nvidia_uvm nvidia_modeset
 
-  # Restore the runner image's baseline state: persistence mode ENABLED
-  # plus nvidia-persistenced running. The runner-team's pre-installed
-  # drivers come up with `Persistence-M: On`, but our .run install leaves
-  # it Off, which breaks tests that toggle the value (cuda.core's
-  # test_persistence_mode_enabled hits NVML_ERROR_UNKNOWN when setting
-  # the mode to its current value on driver 610.43.02).
-  #
-  # `nvidia-smi -pm 1` is the load-bearing call -- it sets the kernel-
-  # level persistence flag directly via NVML (equivalent to what the
-  # daemon would do on startup). The systemctl block is best-effort: the
-  # silent .run installer doesn't always drop the systemd unit, so we
-  # daemon-reload first and tolerate failure on `start`.
+  # Bring nvidia-persistenced back up. We stopped it above, and the
+  # `--silent --no-questions` .run installer drops `/usr/bin/nvidia-
+  # persistenced` but does not reliably reinstall a usable systemd
+  # unit -- so a previous attempt at `systemctl start nvidia-
+  # persistenced.service` was a no-op (see ComputeLab repro on driver
+  # 610.43.02). Exec the daemon directly; it self-daemonizes and
+  # creates `/run/nvidia-persistenced/socket`, which NVML clients in
+  # the test container need for state-changing calls like
+  # `nvmlDeviceSetPersistenceMode` -- without it those calls return
+  # NVML_ERROR_UNKNOWN. nv-gha-runners/vm-images' `nvgha-driver` has
+  # the same gap; their CUDA-runtime validation workload doesn't hit
+  # an NVML SET write so they haven't surfaced it yet.
   set -x
+  /usr/bin/nvidia-persistenced --verbose 2>&1 || true
+  # Set persistence mode explicitly so we match the runner image's
+  # `Persistence-M: On` baseline regardless of how the daemon came up.
   nvidia-smi -pm 1 || true
-  systemctl daemon-reload 2>/dev/null || true
-  systemctl start nvidia-persistenced.service 2>/dev/null || true
   set +x
 }
 

From 701cf2f0f29c470c5093ddaef3b5da08b031ea46 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 19:17:41 +0000
Subject: [PATCH 09/16] CI: pass --user root to nvidia-persistenced after Linux
 driver swap

nvidia-persistenced defaults to `--user nvidia-persistenced`, a user
account that our apt purge of `nvidia-compute-utils-*` removed. Without
that user the daemon's post-fork setuid() fails and the process exits
silently: the `nvidia-smi -pm 1` right after sees Persistence-M briefly
On (the daemon held it), then it flips back to Off (daemon gone), and
the test container's NVML SET call later returns NVML_ERROR_UNKNOWN.

Pass --user root so the daemon doesn't depend on a user account that
the purge deleted. Also add a `pgrep nvidia-persistenced` + `ls -la
/run/nvidia-persistenced/` diagnostic so the next CI log proves the
daemon is alive when the test starts.
---
 ci/tools/install_gpu_driver.sh | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index 27c0b147458..3ddeda9e3ad 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -119,16 +119,27 @@ host_install() {
   # `--silent --no-questions` .run installer drops `/usr/bin/nvidia-
   # persistenced` but does not reliably reinstall a usable systemd
   # unit -- so a previous attempt at `systemctl start nvidia-
-  # persistenced.service` was a no-op (see ComputeLab repro on driver
-  # 610.43.02). Exec the daemon directly; it self-daemonizes and
-  # creates `/run/nvidia-persistenced/socket`, which NVML clients in
-  # the test container need for state-changing calls like
-  # `nvmlDeviceSetPersistenceMode` -- without it those calls return
-  # NVML_ERROR_UNKNOWN. nv-gha-runners/vm-images' `nvgha-driver` has
-  # the same gap; their CUDA-runtime validation workload doesn't hit
-  # an NVML SET write so they haven't surfaced it yet.
+  # persistenced.service` was a no-op. Exec the daemon directly; it
+  # self-daemonizes and creates `/run/nvidia-persistenced/socket`,
+  # which NVML clients in the test container need for state-changing
+  # calls like `nvmlDeviceSetPersistenceMode` -- without it those
+  # calls return NVML_ERROR_UNKNOWN.
+  #
+  # `--user root`: the daemon's default user is `nvidia-persistenced`,
+  # which our apt purge of `nvidia-compute-utils-*` deleted. Without
+  # this flag the daemon's setuid(3) call fails post-fork and the
+  # process exits silently (which leaves Persistence-M flipping back
+  # to Off the moment we exit the start window).
+  #
+  # Same latent gap exists in nv-gha-runners/vm-images' `nvgha-driver`;
+  # their CUDA-runtime validation workload doesn't issue an NVML SET
+  # write so they haven't surfaced it yet.
   set -x
-  /usr/bin/nvidia-persistenced --verbose 2>&1 || true
+  /usr/bin/nvidia-persistenced --verbose --user root 2>&1 || true
+  sleep 1
+  # Diagnostics: confirm the daemon is alive + socket present.
+  pgrep -laf nvidia-persistenced || echo "WARN: nvidia-persistenced not running"
+  ls -la /run/nvidia-persistenced/ 2>&1 || echo "WARN: /run/nvidia-persistenced missing"
   # Set persistence mode explicitly so we match the runner image's
   # `Persistence-M: On` baseline regardless of how the daemon came up.
   nvidia-smi -pm 1 || true

From a3f157382b28f8be433e25866446e852664c5180 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 19:20:32 +0000
Subject: [PATCH 10/16] CI: add fast-feedback probe-driver-swap job
 (workflow_dispatch only)

Allocates one L4 GPU + privileged container, runs install_gpu_driver.sh
with DRIVER=610.43.02, then drives nvmlDeviceSetPersistenceMode via
raw ctypes -- the exact NVML call that cuda.core's
test_persistence_mode_enabled exercises. Exits 1 on
NVML_ERROR_UNKNOWN so the smoke test fails loudly when the install
path leaves the daemon dead.

Total runtime ~5 min vs ~30 min for the full test matrix.

Triggered by workflow_dispatch only -- this is an opt-in debugging
job, not regular PR or nightly traffic.
---
 .github/workflows/ci.yml | 72 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 82ab7210c92..27d827de7e0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -413,6 +413,78 @@ jobs:
     with:
       is-release: ${{ github.ref_type == 'tag' }}
 
+  # Fast-feedback probe for changes to ci/tools/install_gpu_driver.sh.
+  # Allocates one L4 GPU + container, runs the driver swap to a
+  # hard-coded version, then drives nvmlDeviceSetPersistenceMode via
+  # raw ctypes -- the *exact* NVML call that cuda.core's
+  # test_persistence_mode_enabled exercises. Total runtime is ~5 min
+  # vs. ~30 min for a full test matrix.
+  #
+  # `workflow_dispatch` only -- this is an opt-in debugging job, not
+  # part of regular PR or nightly traffic.
+  probe-driver-swap:
+    name: Probe custom-DRIVER install
+    if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'nvidia' }}
+    runs-on: "linux-amd64-gpu-l4-latest-1"
+    timeout-minutes: 15
+    defaults:
+      run:
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+    container:
+      options: -u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host
+      image: ubuntu:22.04
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+
+      - name: Install host deps
+        run: |
+          apt-get update -qq
+          apt-get -y install --no-install-recommends util-linux python3
+
+      - name: Install GPU driver
+        env:
+          DRIVER: '610.43.02'
+          GPU_TYPE: 'l4'
+        run: ./ci/tools/install_gpu_driver.sh
+
+      - name: Show post-install host + container state
+        run: |
+          nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv
+          echo
+          echo "=== /run/nvidia-persistenced ==="
+          ls -la /run/nvidia-persistenced/ 2>&1 || echo "MISSING"
+          echo
+          echo "=== nvidia-persistenced process ==="
+          pgrep -laf nvidia-persistenced || echo "(none)"
+
+      - name: Drive nvmlDeviceSetPersistenceMode via ctypes
+        run: |
+          python3 <<'PYEOF'
+          import ctypes, sys
+          NVML_SUCCESS, NVML_ERROR_NO_PERMISSION, NVML_ERROR_UNKNOWN = 0, 4, 999
+          nvml = ctypes.CDLL("libnvidia-ml.so.1")
+          assert nvml.nvmlInit_v2() == 0, "nvmlInit_v2 failed"
+          h = ctypes.c_void_p()
+          assert nvml.nvmlDeviceGetHandleByIndex_v2(0, ctypes.byref(h)) == 0
+          m = ctypes.c_uint(99)
+          nvml.nvmlDeviceGetPersistenceMode(h, ctypes.byref(m))
+          print(f"current persistence_mode = {m.value} (1=ENABLED, 0=DISABLED)")
+          ret = nvml.nvmlDeviceSetPersistenceMode(h, 0)
+          print(f"SET DISABLED -> {ret}  # 0=SUCCESS, 4=NO_PERMISSION, 999=UNKNOWN")
+          if ret == NVML_ERROR_UNKNOWN:
+              print("FAIL: NVML_ERROR_UNKNOWN -- daemon-down failure mode reproduced", file=sys.stderr)
+              sys.exit(1)
+          if ret != NVML_SUCCESS:
+              print(f"FAIL: unexpected return code {ret}", file=sys.stderr)
+              sys.exit(1)
+          # restore
+          nvml.nvmlDeviceSetPersistenceMode(h, m.value)
+          print("OK")
+          PYEOF
+
   checks:
     name: Check job status
     if: always()

From c5fef92e8c2d203ca0f1b826a37f08717a8a7fc9 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 19:22:56 +0000
Subject: [PATCH 11/16] CI: drop workflow_dispatch gate on probe-driver-swap so
 it runs on every PR

---
 .github/workflows/ci.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 27d827de7e0..3a176deff31 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -418,13 +418,11 @@ jobs:
   # hard-coded version, then drives nvmlDeviceSetPersistenceMode via
   # raw ctypes -- the *exact* NVML call that cuda.core's
   # test_persistence_mode_enabled exercises. Total runtime is ~5 min
-  # vs. ~30 min for a full test matrix.
-  #
-  # `workflow_dispatch` only -- this is an opt-in debugging job, not
-  # part of regular PR or nightly traffic.
+  # vs. ~30 min for a full test matrix; runs on every PR push so we
+  # can iterate on `ci/tools/install_gpu_driver.sh` quickly.
   probe-driver-swap:
     name: Probe custom-DRIVER install
-    if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'nvidia' }}
+    if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     runs-on: "linux-amd64-gpu-l4-latest-1"
     timeout-minutes: 15
     defaults:

From f17dd7f95d981d5546fddc423c47bf3e3a1e9510 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 19:34:25 +0000
Subject: [PATCH 12/16] CI: stop refresh_container_libs from clobbering
 /run/nvidia-persistenced

refresh_container_libs() walks /proc/self/mountinfo for entries
containing 'nvidia' or 'libcuda'. /run/nvidia-persistenced/socket
matches that pattern and was being umount'd + cp'd over -- which
breaks the container's view of the daemon's IPC socket (the
container ends up with a 0-link unlinked socket inode instead of
the live host one). Without a working socket, NVML state-changing
calls inside the container return NVML_ERROR_UNKNOWN -- which is
exactly what cuda.core's test_persistence_mode_enabled was hitting.

Restrict the refresh to /usr/(bin|lib) so it only touches the
actual binaries + shared libraries that change version with the
driver swap. /dev/nvidia*, /proc/driver/nvidia, /run/nvidia-*,
/tmp/nvidia-mps are all left as the toolkit set them up.

Same latent gap exists in nv-gha-runners/vm-images' nvgha-driver;
their CUDA-runtime validation workload never queries the daemon
socket so they haven't surfaced it.
---
 ci/tools/install_gpu_driver.sh | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index 3ddeda9e3ad..05d87357fac 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -155,12 +155,21 @@ refresh_container_libs() {
   # the kernel suffixes field 4 with " (deleted)" once the host unlinks
   # the old lib -- don't break discovery. Filters skip what we can't or
   # shouldn't refresh:
-  #   $3 ~ /^0:/                tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs)
-  #   $5 ~ /\.json$/            vulkan/glvnd config remaps (not version-bound)
-  #   $5 ~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers
+  #   $3 ~ /^0:/                  tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs)
+  #   $5 must be under /usr/(bin|lib)  binaries + libs only -- explicitly
+  #                                NOT /run/nvidia-persistenced/socket
+  #                                (cp'ing the daemon's IPC socket unlinks
+  #                                the container's view and turns later
+  #                                NVML state-changing calls into
+  #                                NVML_ERROR_UNKNOWN); NOT /dev/nvidia*
+  #                                (character devices); NOT /proc/driver/nvidia
+  #                                (procfs); NOT /tmp/nvidia-mps (runtime).
+  #   $5 ~ /\.json$/              vulkan/glvnd config remaps (not version-bound)
+  #   $5 ~ /\/(firmware|xorg)\//  firmware loads host-side; xorg unused in CUDA containers
   local mounts
   mounts=$(awk '
     $3 !~ /^0:/                     &&
+    $5 ~ /^\/usr\/(bin|lib)/        &&
     $5 !~ /\.json$/                 &&
     $5 !~ /\/(firmware|xorg)\//     &&
     $5 ~ /(nvidia|libcuda)/         { print $5 }

From 6412f4f43f71fa61f4a3a23f41ebaf5fbf193dcb Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 19:44:15 +0000
Subject: [PATCH 13/16] CI: take down nvidia-persistenced via pkill, not
 systemctl

The packaged nvidia-persistenced.service has
`RuntimeDirectory=nvidia-persistenced`, which makes systemd `unlink()`
/run/nvidia-persistenced/ when the unit stops. The container has that
directory bind-mounted from the host as of container-start time. When
systemd removes the inode and our subsequent
`/usr/bin/nvidia-persistenced --user root` call re-creates it, the
container's bind mount is stranded on the deleted inode -- its
/run/nvidia-persistenced/socket shows up with link count 0 and NVML
state-changing calls return NVML_ERROR_UNKNOWN.

`pkill -TERM nvidia-persistenced` sends SIGTERM directly to the
daemon, which exits cleanly without involving systemd's
RuntimeDirectory cleanup. The host dir keeps its inode across the
swap; the container's bind mount stays valid; the new daemon's
socket is visible to in-container NVML clients.
---
 ci/tools/install_gpu_driver.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index 05d87357fac..e7d0359a386 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -89,7 +89,19 @@ in_container() {
 host_install() {
   apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod
 
-  systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true
+  # Take down nvidia-persistenced *without* systemctl. The packaged
+  # systemd unit declares `RuntimeDirectory=nvidia-persistenced`, which
+  # makes systemd unlink /run/nvidia-persistenced/ on stop. The
+  # container has /run/nvidia-persistenced/ bind-mounted from host, and
+  # the bind mount points to the dir's inode at container-start time --
+  # if systemd removes the dir and the new daemon recreates it under a
+  # different inode, the container's bind mount goes stale and its
+  # /run/nvidia-persistenced/socket loses its link to the live daemon
+  # endpoint (the file shows up with link count 0 inside the container).
+  # NVML state-changing calls in the container then return
+  # NVML_ERROR_UNKNOWN. Sending SIGTERM directly keeps the inode alive.
+  pkill -TERM nvidia-persistenced 2>/dev/null || true
+  systemctl stop dcgm-exporter 2>/dev/null || true
   # if-test instead of `fuser ... || true` so a kill failure surfaces
   # (fuser exits 1 when nothing holds the device, which is the happy path).
   if fuser /dev/nvidia* >/dev/null 2>&1; then

From 2b34f1f21be63480306c8b2dc30df85cd9e7c37c Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 19:53:54 +0000
Subject: [PATCH 14/16] CI: re-bind /run/nvidia-persistenced into container
 after driver swap

The container's bind mount of /run/nvidia-persistenced/ is taken at
container-start time and pinned to the host directory's then-current
inode. Across the install the host directory gets recreated under a
fresh inode (the daemon's shutdown + restart cycle replaces it), and
the container is stranded on the deleted inode -- socket file shows
up with link count 0 inside the container, NVML state-changing calls
return NVML_ERROR_UNKNOWN.

After refresh_container_libs, umount the stale bind, mkdir the local
mount point if missing, and re-bind from /proc/1/root/run/nvidia-
persistenced (the host's current view via the privileged container's
host-pid-ns access). CAP_SYS_ADMIN required, which custom-DRIVER rows
already grant via --privileged --pid=host.
---
 ci/tools/install_gpu_driver.sh | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index e7d0359a386..adeaca81ea6 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -227,6 +227,24 @@ if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
   _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \
     || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; }
   refresh_container_libs
+
+  # Re-bind /run/nvidia-persistenced from host. The container's original
+  # bind mount of this dir was taken at container-start time and points
+  # to the host's then-current inode. Even with `pkill` (instead of
+  # systemctl) the host dir is recreated by the new daemon under a fresh
+  # inode -- leaving the container's bind mount stranded on a deleted
+  # inode (socket file shows up with link count 0). Re-do the bind mount
+  # so in-container NVML clients see the live daemon endpoint. Needs
+  # CAP_SYS_ADMIN, which we get from the --privileged --pid=host the
+  # workflow adds for custom-DRIVER rows.
+  if [ -d /proc/1/root/run/nvidia-persistenced ]; then
+    set -x
+    umount /run/nvidia-persistenced 2>/dev/null || true
+    mkdir -p /run/nvidia-persistenced
+    mount --bind /proc/1/root/run/nvidia-persistenced /run/nvidia-persistenced
+    ls -la /run/nvidia-persistenced/ >&2
+    set +x
+  fi
 else
   host_install
 fi

From 8d8a9ef49e74d5037d31b07d629cac6f74c55711 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 20:39:05 +0000
Subject: [PATCH 15/16] CI: drop install_gpu_driver.sh experiments that turned
 out non-load-bearing

- Revert `pkill -TERM nvidia-persistenced` to `systemctl stop`; pkill
  alone didn't prevent the host dir's inode from flipping -- the re-bind
  of /run/nvidia-persistenced/ is what restores the container's view.
- Drop `nvidia-smi -pm 1`; the test exercises NVML's set call, which
  succeeds once the daemon socket is reachable regardless of current
  Persistence-M state.
- Trim `set -x` blocks and `pgrep`/`ls -la`/`stat` diagnostics that
  served their purpose during debugging.

Keeps the load-bearing changes (nsenter bash -s, /usr/(bin|lib)
refresh filter, exec nvidia-persistenced --user root, the
/run/nvidia-persistenced re-bind, cp --preserve=mode) and brings the
diff against Justin's nvgha-driver back down to the strict minimum.
---
 ci/tools/install_gpu_driver.sh | 93 +++++++++++-----------------------
 1 file changed, 30 insertions(+), 63 deletions(-)

diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
index adeaca81ea6..db7ae998587 100755
--- a/ci/tools/install_gpu_driver.sh
+++ b/ci/tools/install_gpu_driver.sh
@@ -89,19 +89,7 @@ in_container() {
 host_install() {
   apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod
 
-  # Take down nvidia-persistenced *without* systemctl. The packaged
-  # systemd unit declares `RuntimeDirectory=nvidia-persistenced`, which
-  # makes systemd unlink /run/nvidia-persistenced/ on stop. The
-  # container has /run/nvidia-persistenced/ bind-mounted from host, and
-  # the bind mount points to the dir's inode at container-start time --
-  # if systemd removes the dir and the new daemon recreates it under a
-  # different inode, the container's bind mount goes stale and its
-  # /run/nvidia-persistenced/socket loses its link to the live daemon
-  # endpoint (the file shows up with link count 0 inside the container).
-  # NVML state-changing calls in the container then return
-  # NVML_ERROR_UNKNOWN. Sending SIGTERM directly keeps the inode alive.
-  pkill -TERM nvidia-persistenced 2>/dev/null || true
-  systemctl stop dcgm-exporter 2>/dev/null || true
+  systemctl stop nvidia-persistenced dcgm-exporter 2>/dev/null || true
   # if-test instead of `fuser ... || true` so a kill failure surfaces
   # (fuser exits 1 when nothing holds the device, which is the happy path).
   if fuser /dev/nvidia* >/dev/null 2>&1; then
@@ -127,35 +115,25 @@ host_install() {
          --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
   modprobe nvidia nvidia_uvm nvidia_modeset
 
-  # Bring nvidia-persistenced back up. We stopped it above, and the
-  # `--silent --no-questions` .run installer drops `/usr/bin/nvidia-
-  # persistenced` but does not reliably reinstall a usable systemd
-  # unit -- so a previous attempt at `systemctl start nvidia-
-  # persistenced.service` was a no-op. Exec the daemon directly; it
-  # self-daemonizes and creates `/run/nvidia-persistenced/socket`,
-  # which NVML clients in the test container need for state-changing
-  # calls like `nvmlDeviceSetPersistenceMode` -- without it those
-  # calls return NVML_ERROR_UNKNOWN.
+  # Bring nvidia-persistenced back up. NVML state-changing calls from
+  # inside the test container (e.g. nvmlDeviceSetPersistenceMode, which
+  # cuda.core's test_persistence_mode_enabled exercises) talk to the
+  # daemon via /run/nvidia-persistenced/socket; without a live daemon
+  # they return NVML_ERROR_UNKNOWN.
   #
-  # `--user root`: the daemon's default user is `nvidia-persistenced`,
-  # which our apt purge of `nvidia-compute-utils-*` deleted. Without
-  # this flag the daemon's setuid(3) call fails post-fork and the
-  # process exits silently (which leaves Persistence-M flipping back
-  # to Off the moment we exit the start window).
+  # systemctl can't start the unit (the `--silent --no-questions` .run
+  # installer drops /usr/bin/nvidia-persistenced but no usable systemd
+  # unit), so exec the binary directly -- it self-daemonizes.
   #
-  # Same latent gap exists in nv-gha-runners/vm-images' `nvgha-driver`;
-  # their CUDA-runtime validation workload doesn't issue an NVML SET
-  # write so they haven't surfaced it yet.
-  set -x
-  /usr/bin/nvidia-persistenced --verbose --user root 2>&1 || true
-  sleep 1
-  # Diagnostics: confirm the daemon is alive + socket present.
-  pgrep -laf nvidia-persistenced || echo "WARN: nvidia-persistenced not running"
-  ls -la /run/nvidia-persistenced/ 2>&1 || echo "WARN: /run/nvidia-persistenced missing"
-  # Set persistence mode explicitly so we match the runner image's
-  # `Persistence-M: On` baseline regardless of how the daemon came up.
-  nvidia-smi -pm 1 || true
-  set +x
+  # `--user root` because the default `nvidia-persistenced` user was
+  # deleted along with `nvidia-compute-utils-*` in the purge above;
+  # without this flag the daemon's post-fork setuid() fails silently
+  # and the process exits.
+  #
+  # nv-gha-runners/vm-images' `nvgha-driver` has the same latent gap;
+  # its CUDA-runtime validation workload never issues an NVML SET
+  # write so it hasn't surfaced there.
+  /usr/bin/nvidia-persistenced --verbose --user root || true
 }
 
 # Replace the toolkit's bind-mounted nvidia libs/binaries inside this
@@ -200,21 +178,13 @@ refresh_container_libs() {
       [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; }
     fi
     umount "$tgt" 2>/dev/null || true
-    # --preserve=mode keeps the SUID bit. /usr/bin/nvidia-modprobe ships
-    # 4755 and NVML's state-changing calls (e.g.
-    # nvmlDeviceSetPersistenceMode) go through it; a plain `cp` strips
-    # SUID and the call then fails with NVML_ERROR_UNKNOWN. The runner
-    # team's nvgha-driver has the same bug; we differ here.
+    # --preserve=mode keeps the SUID bit so the refresh doesn't silently
+    # de-privilege binaries like nvidia-modprobe that ship 4755 (the
+    # runner team's nvgha-driver uses plain `cp`, which strips the bit).
     cp -f --preserve=mode --remove-destination "$src" "$tgt" \
       || echo "WARN: refresh failed for $tgt (src=$src)" >&2
   done
   ldconfig
-
-  # Diagnostic: confirm SUID survived on nvidia-modprobe (the load-bearing
-  # piece). One-liner so the next CI log proves the fix.
-  if [ -e /usr/bin/nvidia-modprobe ]; then
-    stat -c 'refresh: %n mode=%a uid=%u' /usr/bin/nvidia-modprobe >&2
-  fi
 }
 
 if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
@@ -228,22 +198,19 @@ if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
     || { echo "::error::container needs 'options: --privileged --pid=host'" >&2; exit 1; }
   refresh_container_libs
 
-  # Re-bind /run/nvidia-persistenced from host. The container's original
-  # bind mount of this dir was taken at container-start time and points
-  # to the host's then-current inode. Even with `pkill` (instead of
-  # systemctl) the host dir is recreated by the new daemon under a fresh
-  # inode -- leaving the container's bind mount stranded on a deleted
-  # inode (socket file shows up with link count 0). Re-do the bind mount
-  # so in-container NVML clients see the live daemon endpoint. Needs
-  # CAP_SYS_ADMIN, which we get from the --privileged --pid=host the
-  # workflow adds for custom-DRIVER rows.
+  # Re-bind /run/nvidia-persistenced from the host. The container's
+  # original bind mount is pinned to the inode the host dir had at
+  # container start; the daemon stop+restart cycle recreates the dir
+  # under a fresh inode, stranding the bind mount on the deleted one
+  # (inside the container the socket shows link count 0 and NVML SET
+  # calls return NVML_ERROR_UNKNOWN). Re-bind from /proc/1/root so the
+  # container picks up the live host dir. Needs CAP_SYS_ADMIN, which
+  # the workflow grants via --privileged --pid=host on custom-DRIVER
+  # rows.
   if [ -d /proc/1/root/run/nvidia-persistenced ]; then
-    set -x
     umount /run/nvidia-persistenced 2>/dev/null || true
     mkdir -p /run/nvidia-persistenced
     mount --bind /proc/1/root/run/nvidia-persistenced /run/nvidia-persistenced
-    ls -la /run/nvidia-persistenced/ >&2
-    set +x
   fi
 else
   host_install

From d2c25ebbb45d9c1615bf9544f60f05aae2453c76 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 7 Jun 2026 21:37:23 +0000
Subject: [PATCH 16/16] Revert: remove the probe-driver-swap fast-feedback job

Added in a3f157382b for fast iteration on install_gpu_driver.sh; no
longer needed now that the script has stabilized.
---
 .github/workflows/ci.yml | 70 ----------------------------------------
 1 file changed, 70 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3a176deff31..82ab7210c92 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -413,76 +413,6 @@ jobs:
     with:
       is-release: ${{ github.ref_type == 'tag' }}
 
-  # Fast-feedback probe for changes to ci/tools/install_gpu_driver.sh.
-  # Allocates one L4 GPU + container, runs the driver swap to a
-  # hard-coded version, then drives nvmlDeviceSetPersistenceMode via
-  # raw ctypes -- the *exact* NVML call that cuda.core's
-  # test_persistence_mode_enabled exercises. Total runtime is ~5 min
-  # vs. ~30 min for a full test matrix; runs on every PR push so we
-  # can iterate on `ci/tools/install_gpu_driver.sh` quickly.
-  probe-driver-swap:
-    name: Probe custom-DRIVER install
-    if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
-    runs-on: "linux-amd64-gpu-l4-latest-1"
-    timeout-minutes: 15
-    defaults:
-      run:
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-    container:
-      options: -u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host
-      image: ubuntu:22.04
-      env:
-        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
-
-      - name: Install host deps
-        run: |
-          apt-get update -qq
-          apt-get -y install --no-install-recommends util-linux python3
-
-      - name: Install GPU driver
-        env:
-          DRIVER: '610.43.02'
-          GPU_TYPE: 'l4'
-        run: ./ci/tools/install_gpu_driver.sh
-
-      - name: Show post-install host + container state
-        run: |
-          nvidia-smi --query-gpu=name,driver_version,persistence_mode --format=csv
-          echo
-          echo "=== /run/nvidia-persistenced ==="
-          ls -la /run/nvidia-persistenced/ 2>&1 || echo "MISSING"
-          echo
-          echo "=== nvidia-persistenced process ==="
-          pgrep -laf nvidia-persistenced || echo "(none)"
-
-      - name: Drive nvmlDeviceSetPersistenceMode via ctypes
-        run: |
-          python3 <<'PYEOF'
-          import ctypes, sys
-          NVML_SUCCESS, NVML_ERROR_NO_PERMISSION, NVML_ERROR_UNKNOWN = 0, 4, 999
-          nvml = ctypes.CDLL("libnvidia-ml.so.1")
-          assert nvml.nvmlInit_v2() == 0, "nvmlInit_v2 failed"
-          h = ctypes.c_void_p()
-          assert nvml.nvmlDeviceGetHandleByIndex_v2(0, ctypes.byref(h)) == 0
-          m = ctypes.c_uint(99)
-          nvml.nvmlDeviceGetPersistenceMode(h, ctypes.byref(m))
-          print(f"current persistence_mode = {m.value} (1=ENABLED, 0=DISABLED)")
-          ret = nvml.nvmlDeviceSetPersistenceMode(h, 0)
-          print(f"SET DISABLED -> {ret}  # 0=SUCCESS, 4=NO_PERMISSION, 999=UNKNOWN")
-          if ret == NVML_ERROR_UNKNOWN:
-              print("FAIL: NVML_ERROR_UNKNOWN -- daemon-down failure mode reproduced", file=sys.stderr)
-              sys.exit(1)
-          if ret != NVML_SUCCESS:
-              print(f"FAIL: unexpected return code {ret}", file=sys.stderr)
-              sys.exit(1)
-          # restore
-          nvml.nvmlDeviceSetPersistenceMode(h, m.value)
-          print("OK")
-          PYEOF
-
   checks:
     name: Check job status
     if: always()