diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9f9236b09f..496d4480b9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ on:
   schedule:
     # every 24 hours at midnight UTC
     - cron: "0 0 * * *"
+  workflow_dispatch: {}
 
 jobs:
   ci-vars:
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 9581cff308..de1e713e49 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -275,13 +275,15 @@ jobs:
         uses: nv-gha-runners/setup-proxy-cache@main
         continue-on-error: true
 
-      - name: Update driver
+      # DRIVER above is 'latest' so install_gpu_driver.ps1 is intentionally
+      # skipped (it errors on latest/earliest); configure_driver_mode.ps1
+      # still runs to put the pre-installed driver into TCC mode.
+      - name: Configure driver mode
         shell: powershell
         env:
           DRIVER_MODE: "TCC"
-          GPU_TYPE: "a100"
         run: |
-          ci/tools/install_gpu_driver.ps1
+          ci/tools/configure_driver_mode.ps1
 
       - name: Ensure GPU is working
         run: |
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index f8002f5124..e2f1007e53 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -85,8 +85,13 @@ jobs:
           # Read base matrix from YAML file for the specific architecture
           TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)
 
-          # Apply matrix filter and wrap in include structure
-          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
+          # Apply matrix filter; reject custom DRIVER + FLAVOR=wsl (the
+          # in-container driver swap doesn't work under WSL); add a
+          # RUNNER_DRIVER field that maps any custom version back to
+          # 'latest' (the install script swaps the driver itself, so we
+          # need to land on the runner that ships with the most recent
+          # pre-installed driver); wrap in include structure.
+          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
 
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
@@ -101,21 +106,21 @@ jobs:
     strategy:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
-    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
     # The build stage could fail but we want the CI to keep moving.
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     # Our self-hosted runners require a container
     # TODO: use a different (nvidia?) container
     container:
-      options: -u root --security-opt seccomp=unconfined --shm-size 16g
+      # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh
+      # can nsenter to the host for the install + refresh the toolkit bind mounts
+      # back inside the container. Stock options for latest/earliest rows.
+      options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }}
       image: ubuntu:22.04
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
         PIP_CACHE_DIR: "/tmp/pip-cache"
     steps:
-      - name: Ensure GPU is working
-        run: nvidia-smi
-
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
 
@@ -127,10 +132,22 @@ jobs:
         uses: ./.github/actions/install_unix_deps
         continue-on-error: false
         with:
-          # for artifact fetching, graphics libs, g++ required for cffi in example
-          dependencies: "jq wget libgl1 libegl1 g++"
+          # for artifact fetching, graphics libs, g++ required for cffi in
+          # example; util-linux for `nsenter` (custom-DRIVER rows re-exec
+          # install_gpu_driver.sh onto the host through nsenter)
+          dependencies: "jq wget libgl1 libegl1 g++ util-linux"
           dependent_exes: "jq wget"
 
+      - name: Install GPU driver
+        if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
+        env:
+          DRIVER: ${{ matrix.DRIVER }}
+          GPU_TYPE: ${{ matrix.GPU }}
+        run: ./ci/tools/install_gpu_driver.sh
+
+      - name: Ensure GPU is working
+        run: nvidia-smi
+
       - name: Set environment variables
         env:
           BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 320817177f..5675b395af 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -81,8 +81,11 @@ jobs:
           # Read base matrix from YAML file for the specific architecture
           TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)
 
-          # Apply matrix filter and wrap in include structure
-          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
+          # Apply matrix filter; add a RUNNER_DRIVER field that maps any
+          # custom DRIVER version back to 'latest' (install_gpu_driver.ps1
+          # swaps the driver itself, so the runner must be the one that
+          # ships the most recent pre-installed driver); wrap in include.
+          MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
 
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
@@ -95,7 +98,7 @@ jobs:
       fail-fast: false
       matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
-    runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
+    runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
     steps:
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
@@ -106,13 +109,20 @@ jobs:
         with:
           enable-apt: true
 
-      - name: Update driver
+      - name: Install GPU driver
+        if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
         env:
-          DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
+          DRIVER: ${{ matrix.DRIVER }}
           GPU_TYPE: ${{ matrix.GPU }}
         run: |
           ci/tools/install_gpu_driver.ps1
 
+      - name: Configure driver mode
+        env:
+          DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
+        run: |
+          ci/tools/configure_driver_mode.ps1
+
       - name: Ensure GPU is working
         run: |
           nvidia-smi
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 95c5e714ca..51f0d3f063 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -13,7 +13,16 @@
 # Windows entries also include DRIVER_MODE.
 #
 # Notes:
+# - DRIVER accepts:
+#     * 'latest'   - use the runner's pre-installed latest driver (no install step)
+#     * 'earliest' - use the runner's pre-installed earliest driver (no install step)
+#     * a version string (e.g. '580.65.06')
+#                  - install that version via ci/tools/install_gpu_driver.sh (Linux)
+#                    or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the
+#                    job. The matrix row is routed to the 'latest' runner image (the
+#                    install scripts swap the driver themselves).
 # - DRIVER: 'earliest' does not work with CUDA 12.9.1
+# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux.
 
 linux:
   pull-request:
@@ -29,10 +38,10 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: '610.43.02' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
@@ -74,7 +83,7 @@ linux:
     - { MODE: 'nightly-pytorch',    ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     # nightly-standard (arm64 l4×2 — nightly-only per runner team request)
@@ -113,4 +122,4 @@ windows:
     - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1',  TORCH_CUDA: 'cu130' }
     # nightly-numba-cuda
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
-    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
+    - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36',  DRIVER_MODE: 'TCC' }
diff --git a/ci/tools/configure_driver_mode.ps1 b/ci/tools/configure_driver_mode.ps1
new file mode 100644
index 0000000000..42e0914935
--- /dev/null
+++ b/ci/tools/configure_driver_mode.ps1
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI
+# runner and cycle the display devices so the new mode takes effect
+# without rebooting. Always runs (whether or not install_gpu_driver.ps1
+# just ran). When install_gpu_driver.ps1 has run, this single device
+# cycle also activates the freshly-installed driver.
+#
+# Inputs (env):
+#   DRIVER_MODE  One of WDDM, TCC, MCDM.
+
+function Set-DriverMode {
+    # Apply $env:DRIVER_MODE via `nvidia-smi -fdm`, power-cycle the NVIDIA
+    # display adapters, then wait for nvidia-smi to succeed again.
+    # The mode-to-code table below assumes we have the prior knowledge on
+    # which GPU can use which mode.
+    switch ($env:DRIVER_MODE) {
+        "WDDM" { $mode_label = "WDDM"; $fdm_code = 0 }
+        "TCC"  { $mode_label = "TCC";  $fdm_code = 1 }
+        "MCDM" { $mode_label = "MCDM"; $fdm_code = 2 }
+        default {
+            Write-Output "Unknown driver mode: $env:DRIVER_MODE"
+            exit 1
+        }
+    }
+    Write-Output "Setting driver mode to $mode_label..."
+    nvidia-smi -fdm $fdm_code
+
+    # Cycle NVIDIA display adapters only; other display devices (e.g. QEMU
+    # VGA) are left alone.
+    $gpu_adapters = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
+    foreach ($adapter in $gpu_adapters) {
+        $instance_id = $adapter.InstanceId
+        Write-Output "Restarting device: $($adapter.FriendlyName) ($instance_id)"
+        pnputil /disable-device "$instance_id"
+        pnputil /enable-device "$instance_id"
+    }
+
+    # pnputil enable can return before NVML is ready (e.g. 2x H100 MCDM rows),
+    # so probe nvidia-smi every 2s for up to ~60s rather than sleeping a fixed
+    # time. Pattern borrowed from the runner-team `nvgha-driver.ps1`.
+    Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..."
+    $give_up_at = (Get-Date).AddSeconds(60)
+    while ($true) {
+        Start-Sleep -Seconds 2
+        & nvidia-smi.exe 2>&1 | Out-Null
+        if ($LASTEXITCODE -eq 0) { return }
+        if ((Get-Date) -ge $give_up_at) { break }
+    }
+    # Deadline passed without a successful nvidia-smi run.
+    Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle"
+    exit 1
+}
+
+# Run the functions
+Set-DriverMode
diff --git a/ci/tools/install_gpu_driver.ps1 b/ci/tools/install_gpu_driver.ps1
index c98416c87e..e61c6bbdbb 100644
--- a/ci/tools/install_gpu_driver.ps1
+++ b/ci/tools/install_gpu_driver.ps1
@@ -1,13 +1,30 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+#
+# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a
+# Windows CI runner. Driver-mode selection and the post-install device
+# power-cycle are the responsibility of configure_driver_mode.ps1, which
+# the workflow runs immediately after this script (or by itself when
+# DRIVER is 'latest'/'earliest' and the runner already brings up the
+# right driver).
+#
+# Inputs (env):
+#   DRIVER    Driver version, e.g. "610.47". Must NOT be 'latest' or
+#             'earliest' -- those are runner-pre-installed and the
+#             workflow is expected to skip this script for them.
+#   GPU_TYPE  Lower-case GPU label from the matrix (e.g. "l4", "rtx4090").
+#             Selects the data-center vs desktop installer variant.
 
 # Install the driver
 function Install-Driver {
 
-    # Set the correct URL, filename, and arguments to the installer
-    # This driver is picked to support Windows 11 & CUDA 13.0
-    $version = '581.15'
+    # Driver version is plumbed from the matrix via the DRIVER env var.
+    $version = $env:DRIVER
+    if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') {
+        Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'."
+        exit 1
+    }
 
     # Get GPU type from environment variable
     $gpu_type = $env:GPU_TYPE
@@ -54,33 +71,7 @@ function Install-Driver {
     # Install the file with the specified path from earlier
     Write-Output 'Running the driver installer...'
     Start-Process -FilePath $filepath -ArgumentList $install_args -Wait
-    Write-Output 'Done!'
-
-    # Handle driver mode configuration
-    # This assumes we have the prior knowledge on which GPU can use which mode.
-    $driver_mode = $env:DRIVER_MODE
-    if ($driver_mode -eq "WDDM") {
-        Write-Output "Setting driver mode to WDDM..."
-        nvidia-smi -fdm 0
-    } elseif ($driver_mode -eq "TCC") {
-        Write-Output "Setting driver mode to TCC..."
-        nvidia-smi -fdm 1
-    } elseif ($driver_mode -eq "MCDM") {
-        Write-Output "Setting driver mode to MCDM..."
-        nvidia-smi -fdm 2
-    } else {
-        Write-Output "Unknown driver mode: $driver_mode"
-        exit 1
-    }
-    # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
-    $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
-    foreach ($device in $nvidia_devices) {
-        Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
-        pnputil /disable-device "$($device.InstanceId)"
-        pnputil /enable-device "$($device.InstanceId)"
-    }
-    # Give it a minute to settle:
-    Start-Sleep -Seconds 5
+    Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.'
 }
 
 # Run the functions
diff --git a/ci/tools/install_gpu_driver.sh b/ci/tools/install_gpu_driver.sh
new file mode 100755
index 0000000000..777edd6351
--- /dev/null
+++ b/ci/tools/install_gpu_driver.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# install_gpu_driver.sh -- install a specific NVIDIA driver version on a
+# Linux CI runner. Adapted from nv-gha-runners/vm-images PR #256
+# (`nvgha-driver` CLI), trimmed and parameterised for cuda-python's CI.
+#
+# !!! ALPHA !!!
+# Performs live modifications to the host driver stack (kernel module
+# reload, package replacement, and -- inside containers -- toolkit
+# bind-mount refresh) and may cause issues.
+#
+# Inputs (env):
+#   DRIVER    Driver version, e.g. "580.65.06". Must NOT be 'latest' or
+#             'earliest' -- those are runner-pre-installed and the
+#             workflow is expected to skip this script for them.
+#   GPU_TYPE  Lower-case GPU label from the matrix (e.g. "v100", "l4",
+#             "h100"). Used only to pick the kernel module flavor
+#             (Volta needs the proprietary/legacy module; everything
+#             newer can use the open module).
+#
+# Arch is detected from `uname -m`.
+#
+# When the script runs inside a container (the cuda-python Linux jobs do)
+# it re-execs itself on the host via `nsenter`. The job must declare
+# `options: --privileged --pid=host` (the workflow only does this for
+# matrix rows with a custom DRIVER). After the host-side install, the
+# container's bind-mounted nvidia libs/binaries are refreshed in-place so
+# the new driver is visible without restarting the container.
+set -euo pipefail
+
+: "${DRIVER:?DRIVER env var is required (e.g. 580.65.06)}"
+: "${GPU_TYPE:?GPU_TYPE env var is required (e.g. l4)}"
+
+case "$DRIVER" in
+  latest|earliest)
+    echo "::error::install_gpu_driver.sh must not be invoked with DRIVER=$DRIVER (runner-pre-installed)" >&2
+    exit 1
+    ;;
+esac
+
+VERSION="$DRIVER"
+
+# Volta (V100) requires the legacy/proprietary kernel module; all newer
+# GPUs in this matrix support the open module. Extend this if/when older
+# GPUs return to the matrix.
+case "$GPU_TYPE" in
+  v100) KMT=proprietary ;;
+  *)    KMT=open ;;
+esac
+
+case "$(uname -m)" in
+  x86_64)
+    ARCH_DIR=Linux-x86_64
+    ARCH_SUFFIX=x86_64
+    ;;
+  aarch64)
+    ARCH_DIR=Linux-aarch64
+    ARCH_SUFFIX=aarch64
+    ;;
+  *)
+    echo "::error::unsupported arch: $(uname -m)" >&2
+    exit 1
+    ;;
+esac
+
+URL="https://us.download.nvidia.com/XFree86/${ARCH_DIR}/${VERSION}/NVIDIA-Linux-${ARCH_SUFFIX}-${VERSION}.run"
+
+# Re-elevate to root if needed (sudo is preinstalled on the runner image).
+if [ "$(id -u)" != 0 ]; then
+  exec sudo -E DRIVER="$DRIVER" GPU_TYPE="$GPU_TYPE" "$0" "$@"
+fi
+
+echo "install_gpu_driver.sh is ALPHA -- it performs live modifications to the host driver stack and may cause issues" >&2
+echo "DRIVER=${VERSION}  GPU_TYPE=${GPU_TYPE}  KMT=${KMT}  ARCH=${ARCH_SUFFIX}" >&2
+echo "URL=${URL}" >&2
+
+# Toolkit packages we keep across the purge: dockerd's --runtime=nvidia
+# resolves nvidia-container-runtime through these, and removing them
+# breaks `docker exec` against any container started with that runtime.
+KEEP_RE='^(nvidia-container-toolkit(-base)?|libnvidia-container1|libnvidia-container-tools)$'
+
+in_container() {
+  [ -f /.dockerenv ] || grep -qE '/(docker|kubepods|containerd)' /proc/1/cgroup 2>/dev/null
+}
+
+host_install() {
+  # Runs outside the container (directly, or via the nsenter re-exec): swaps the installed driver for $URL's.
+  apt-get -y install build-essential dkms "linux-headers-$(uname -r)" psmisc kmod
+
+  systemctl stop nvidia-persistenced dcgm-exporter || true
+  # if-test instead of `fuser ... || true` so a kill failure surfaces
+  # (fuser exits 1 when nothing holds the device, which is the happy path).
+  if fuser /dev/nvidia* >/dev/null 2>&1; then
+    fuser -kv /dev/nvidia*
+  fi
+  sleep 1
+  for m in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+    rmmod "$m" 2>/dev/null || true
+  done
+
+  # Purge existing nvidia/libnvidia packages, except the toolkit pieces
+  # captured by KEEP_RE. Tolerate apt failures: postrm scripts can trip
+  # and the .run installer is about to replace everything anyway.
+  dpkg-query -W -f='${Package}\n' 'nvidia-*' 'libnvidia-*' 2>/dev/null \
+    | awk -v re="$KEEP_RE" '$0 !~ re' \
+    | xargs -r apt-get -y remove --purge || true
+
+  local d
+  d=$(mktemp -d)
+  ( cd "$d" \
+    && wget -q -O installer.run "$URL" \
+    && sh installer.run --silent --dkms --no-questions \
+         --accept-license --ui=none --no-cc-version-check --kernel-module-type="$KMT" )
+  # `-a` is required: without it modprobe loads only the first name and
+  # hands the remaining words to that module as parameters.
+  modprobe -a nvidia nvidia_uvm nvidia_modeset
+
+  # Bring nvidia-persistenced back up. NVML state-changing calls from
+  # inside the test container (e.g. nvmlDeviceSetPersistenceMode, which
+  # cuda.core's test_persistence_mode_enabled exercises) talk to the
+  # daemon via /run/nvidia-persistenced/socket; without a live daemon
+  # they return NVML_ERROR_UNKNOWN.
+  # systemctl can't start the unit (the `--silent --no-questions` .run
+  # installer drops /usr/bin/nvidia-persistenced but no usable systemd
+  # unit), so exec the binary directly -- it self-daemonizes.
+  # `--user root` because the default `nvidia-persistenced` user was
+  # deleted along with `nvidia-compute-utils-*` in the purge above;
+  # without this flag the daemon's post-fork setuid() fails silently
+  # and the process exits.
+  # nv-gha-runners/vm-images' `nvgha-driver` has the same latent gap;
+  # its CUDA-runtime validation workload never issues an NVML SET
+  # write so it hasn't surfaced there.
+  /usr/bin/nvidia-persistenced --verbose --user root || true
+}
+
+# Replace the toolkit's bind-mounted nvidia libs/binaries inside this
+# container with copies from the host's new install. `cp` (not
+# `mount --bind`) because procfs-routed binds drop the exec bit.
+refresh_container_libs() {
+  # Walk /proc/self/mountinfo and pick the toolkit-injected nvidia binds
+  # by mount point (field 5), not source (field 4): the kernel suffixes
+  # field 4 with " (deleted)" once the host unlinks the old lib. A mount
+  # is refreshed only when every awk condition below holds:
+  #   $3 !~ /^0:/                 not tmpfs/proc/sysfs (e.g. the toolkit hook tmpfs)
+  #   $5 ~ /^\/usr\/(bin|lib)/    binaries + libs only -- explicitly NOT
+  #                                /run/nvidia-persistenced/socket (cp'ing the
+  #                                daemon's IPC socket unlinks the container's
+  #                                view and turns later NVML state-changing
+  #                                calls into NVML_ERROR_UNKNOWN); NOT
+  #                                /dev/nvidia* (character devices); NOT
+  #                                /proc/driver/nvidia (procfs); NOT
+  #                                /tmp/nvidia-mps (runtime).
+  #   $5 !~ /\.json$/             not vulkan/glvnd config remaps (not version-bound)
+  #   $5 !~ /\/(firmware|xorg)\// firmware loads host-side; xorg unused in CUDA containers
+  #   $5 ~ /(nvidia|libcuda)/     mount point path mentions nvidia or libcuda
+  local mounts
+  mounts=$(awk '
+    $3 !~ /^0:/                     &&
+    $5 ~ /^\/usr\/(bin|lib)/        &&
+    $5 !~ /\.json$/                 &&
+    $5 !~ /\/(firmware|xorg)\//     &&
+    $5 ~ /(nvidia|libcuda)/         { print $5 }
+  ' /proc/self/mountinfo | sort -u)
+
+  for tgt in $mounts; do
+    local src="/proc/1/root$tgt"
+    if [ ! -e "$src" ]; then
+      # Driver swap rewrites the version suffix (libfoo.so.595.71.05 ->
+      # libfoo.so.580.65.06); strip it and take the highest-versioned match.
+      local base
+      base=$(basename "$tgt")
+      base="${base%.so.*}.so"
+      src=$(find "/proc/1/root$(dirname "$tgt")" -maxdepth 1 -name "${base}.*" 2>/dev/null \
+            | sort -V | tail -n1)
+      [ -n "$src" ] || { echo "skip $tgt: no host source" >&2; continue; }
+    fi
+    umount "$tgt" 2>/dev/null || true
+    # --preserve=mode keeps the SUID bit so the refresh doesn't silently
+    # de-privilege binaries like nvidia-modprobe that ship 4755 (the
+    # runner-team's nvgha-driver uses plain `cp` and has the same gap).
+    cp -f --preserve=mode --remove-destination "$src" "$tgt" \
+      || echo "WARN: refresh failed for $tgt (src=$src)" >&2
+  done
+  ldconfig
+}
+
+if [ -z "${_NVDRV_NSENTERED:-}" ] && in_container; then
+  # Re-exec on the host. The runner-team's `nvgha-driver` script lives at a
+  # host-side absolute path so `"$0"` survives the mount-namespace flip;
+  # ours lives in the GH workspace mount (container-only), so we pipe the
+  # script body in via stdin instead -- the `< "$0"` fd is opened before
+  # nsenter and stays valid across the namespace switch. Env vars (DRIVER,
+  # GPU_TYPE, _NVDRV_NSENTERED) are inherited by the host-side bash.
+  _NVDRV_NSENTERED=1 nsenter -t 1 -m -p -n -i -u -- bash -s < "$0" \
+    || { echo "::error::host-side driver install failed (see log above); if nsenter itself failed, the container needs 'options: --privileged --pid=host'" >&2; exit 1; }
+  refresh_container_libs
+
+  # Re-bind /run/nvidia-persistenced from the host. The container's original
+  # bind mount was pinned to the host's at-container-start inode; the daemon
+  # stop+restart cycle recreates the dir under a fresh inode, stranding the
+  # bind mount on the deleted one (socket shows up with link count 0 inside
+  # the container, NVML SET calls return NVML_ERROR_UNKNOWN). Re-bind from
+  # /proc/1/root so the container picks up the live host dir. Needs
+  # CAP_SYS_ADMIN, which the workflow grants via --privileged --pid=host
+  # on custom-DRIVER rows.
+  if [ -d /proc/1/root/run/nvidia-persistenced ]; then
+    umount /run/nvidia-persistenced 2>/dev/null || true
+    mkdir -p /run/nvidia-persistenced
+    mount --bind /proc/1/root/run/nvidia-persistenced /run/nvidia-persistenced
+  fi
+else
+  host_install
+fi
+
+# Final check in the current namespace: the driver answers and reports $VERSION.
+nvidia-smi >/dev/null
+grep -qF "$VERSION" /proc/driver/nvidia/version || { echo "::error::loaded NVIDIA driver is not ${VERSION} (see /proc/driver/nvidia/version)" >&2; exit 1; }