Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ on:
schedule:
# every 24 hours at midnight UTC
- cron: "0 0 * * *"
workflow_dispatch:

jobs:
ci-vars:
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -275,13 +275,15 @@ jobs:
uses: nv-gha-runners/setup-proxy-cache@main
continue-on-error: true

- name: Update driver
# DRIVER above is 'latest' so install_gpu_driver.ps1 is intentionally
# skipped (it errors on latest/earliest); configure_driver_mode.ps1
# still runs to put the pre-installed driver into TCC mode.
- name: Configure driver mode
shell: powershell
env:
DRIVER_MODE: "TCC"
GPU_TYPE: "a100"
run: |
ci/tools/install_gpu_driver.ps1
ci/tools/configure_driver_mode.ps1

- name: Ensure GPU is working
run: |
Expand Down
34 changes: 27 additions & 7 deletions .github/workflows/test-wheel-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,13 @@ jobs:
# Read base matrix from YAML file for the specific architecture
TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)

# Apply matrix filter and wrap in include structure
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
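# Build the job matrix in four steps:
#   1. Apply the caller-supplied matrix filter.
#   2. Reject any row that combines a custom DRIVER version with
#      FLAVOR=wsl (the in-container driver swap doesn't work under WSL).
#   3. Add a RUNNER_DRIVER field: 'latest' and 'earliest' pass through
#      unchanged, while any custom version maps back to 'latest' (the
#      install script swaps the driver itself, so the job must land on
#      the runner that ships the most recent pre-installed driver).
#   4. Wrap the rows in an include structure, failing on an empty matrix.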
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')

echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"

Expand All @@ -101,21 +106,21 @@ jobs:
strategy:
fail-fast: false
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
# The build stage could fail but we want the CI to keep moving.
if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
# Our self-hosted runners require a container
# TODO: use a different (nvidia?) container
container:
options: -u root --security-opt seccomp=unconfined --shm-size 16g
# Rows with a custom DRIVER version need --privileged --pid=host so that
# install_gpu_driver.sh can nsenter into the host to install the driver and
# then refresh the toolkit bind mounts back inside the container. Rows that
# use 'latest' or 'earliest' keep the stock options.
options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }}
image: ubuntu:22.04
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
PIP_CACHE_DIR: "/tmp/pip-cache"
steps:
- name: Ensure GPU is working
run: nvidia-smi

- name: Checkout ${{ github.event.repository.name }}
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

Expand All @@ -131,6 +136,21 @@ jobs:
dependencies: "jq wget libgl1 libegl1 g++"
dependent_exes: "jq wget"

- name: Install GPU driver
if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
env:
DRIVER: ${{ matrix.DRIVER }}
GPU_TYPE: ${{ matrix.GPU }}
run: |
# util-linux for nsenter; install_gpu_driver.sh re-execs onto the
# host (requires --privileged --pid=host on the container, set
# conditionally above) and refreshes the toolkit bind mounts here.
apt-get -y install --no-install-recommends util-linux
./ci/tools/install_gpu_driver.sh

- name: Ensure GPU is working
run: nvidia-smi

- name: Set environment variables
env:
BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }}
Expand Down
20 changes: 15 additions & 5 deletions .github/workflows/test-wheel-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,11 @@ jobs:
# Read base matrix from YAML file for the specific architecture
TEST_MATRIX=$(yq -o json ".windows[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml)

# Apply matrix filter and wrap in include structure
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')
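# Build the job matrix in three steps:
#   1. Apply the caller-supplied matrix filter.
#   2. Add a RUNNER_DRIVER field: 'latest' and 'earliest' pass through
#      unchanged, while any custom DRIVER version maps back to 'latest'
#      (install_gpu_driver.ps1 swaps the driver itself, so the runner
#      must be the one that ships the most recent pre-installed driver).
#   3. Wrap the rows in an include structure, failing on an empty matrix.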
MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end')

echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"

Expand All @@ -95,7 +98,7 @@ jobs:
fail-fast: false
matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}"
runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}"
steps:
- name: Checkout ${{ github.event.repository.name }}
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
Expand All @@ -106,13 +109,20 @@ jobs:
with:
enable-apt: true

- name: Update driver
- name: Install GPU driver
if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }}
env:
DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
DRIVER: ${{ matrix.DRIVER }}
GPU_TYPE: ${{ matrix.GPU }}
run: |
ci/tools/install_gpu_driver.ps1

- name: Configure driver mode
env:
DRIVER_MODE: ${{ matrix.DRIVER_MODE }}
run: |
ci/tools/configure_driver_mode.ps1

- name: Ensure GPU is working
run: |
nvidia-smi
Expand Down
17 changes: 13 additions & 4 deletions ci/test-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,16 @@
# Windows entries also include DRIVER_MODE.
#
# Notes:
# - DRIVER accepts:
#   * 'latest'   - use the runner's pre-installed latest driver (no install step)
#   * 'earliest' - use the runner's pre-installed earliest driver (no install step)
#   * a version string (e.g. '580.65.06') - install that version at the start
#     of the job via ci/tools/install_gpu_driver.sh (Linux) or
#     ci/tools/install_gpu_driver.ps1 (Windows). The matrix row is routed to
#     the 'latest' runner image, because the install scripts swap the driver
#     themselves.
# - DRIVER: 'earliest' does not work with CUDA 12.9.1.
# - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux.

linux:
pull-request:
Expand All @@ -29,10 +38,10 @@ linux:
- { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
Expand Down Expand Up @@ -74,7 +83,7 @@ linux:
- { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' }
# nightly-numba-cuda
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' }
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

- { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
# nightly-standard (arm64 l4×2 — nightly-only per runner team request)
Expand Down Expand Up @@ -113,4 +122,4 @@ windows:
- { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' }
# nightly-numba-cuda
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
- { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' }
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

58 changes: 58 additions & 0 deletions ci/tools/configure_driver_mode.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI
# runner and cycle the NVIDIA display devices so the new mode takes effect
# without rebooting. The workflow runs this script unconditionally, whether
# or not install_gpu_driver.ps1 ran first. When it did, the same device
# cycle also activates the freshly installed driver.
#
# Inputs (env):
# DRIVER_MODE One of WDDM, TCC, MCDM.

# Set-DriverMode -- apply the driver mode named by $env:DRIVER_MODE, restart
# the NVIDIA display adapters so the mode takes effect, then wait for
# nvidia-smi to succeed again.
#
# Inputs (env):
#   DRIVER_MODE   One of WDDM, TCC, MCDM.
#
# Exits the script with code 1 when DRIVER_MODE is not one of the values
# above, or when nvidia-smi still fails about 60s after the device cycle.
function Set-DriverMode {

    # Map the matrix DRIVER_MODE to the corresponding nvidia-smi -fdm code.
    # The matrix author is expected to know which mode each GPU supports;
    # this script does not check that, and it does not inspect the exit
    # code of the `nvidia-smi -fdm` call.
    $driver_mode = $env:DRIVER_MODE
    if ($driver_mode -eq "WDDM") {
        Write-Output "Setting driver mode to WDDM..."
        nvidia-smi -fdm 0
    } elseif ($driver_mode -eq "TCC") {
        Write-Output "Setting driver mode to TCC..."
        nvidia-smi -fdm 1
    } elseif ($driver_mode -eq "MCDM") {
        Write-Output "Setting driver mode to MCDM..."
        nvidia-smi -fdm 2
    } else {
        Write-Output "Unknown driver mode: $driver_mode"
        exit 1
    }

    # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
    $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
    foreach ($device in $nvidia_devices) {
        Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
        pnputil /disable-device "$($device.InstanceId)"
        pnputil /enable-device "$($device.InstanceId)"
    }

    # Poll nvidia-smi until NVML can initialize, or give up after ~60s.
    # A fixed sleep is not enough on multi-GPU rows that are slow to come
    # back up (e.g. 2x H100 MCDM), where `pnputil /enable-device` returns
    # before NVML is ready. Pattern borrowed from the runner-team
    # `nvgha-driver.ps1`.
    Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..."
    $deadline = (Get-Date).AddSeconds(60)
    do {
        # Sleep first, then probe; the probe's output is discarded because
        # only its exit code matters here.
        Start-Sleep -Seconds 2
        & nvidia-smi.exe 2>&1 | Out-Null
    } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline)
    if ($LASTEXITCODE -ne 0) {
        Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle"
        exit 1
    }
}

# Entry point: apply the requested driver mode.
Set-DriverMode
51 changes: 21 additions & 30 deletions ci/tools/install_gpu_driver.ps1
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# install_gpu_driver.ps1 -- install a specific NVIDIA driver version on a
# Windows CI runner. Driver-mode selection and the post-install device
# power-cycle are the responsibility of configure_driver_mode.ps1, which
# the workflow runs immediately after this script (or by itself when
# DRIVER is 'latest'/'earliest' and the runner already brings up the
# right driver).
#
# Inputs (env):
# DRIVER Driver version, e.g. "610.47". Must NOT be 'latest' or
# 'earliest' -- those are runner-pre-installed and the
# workflow is expected to skip this script for them.
# GPU_TYPE Lower-case GPU label from the matrix (e.g. "l4", "rtx4090").
# Selects the data-center vs desktop installer variant.

# Install the driver
function Install-Driver {

# Set the correct URL, filename, and arguments to the installer
# This driver is picked to support Windows 11 & CUDA 13.0
$version = '581.15'
# Driver version is plumbed from the matrix via the DRIVER env var.
$version = $env:DRIVER
if (-not $version -or $version -eq 'latest' -or $version -eq 'earliest') {
Write-Error "DRIVER env var must be a specific version string (e.g. '610.47'); got '$version'."
exit 1
}

# Get GPU type from environment variable
$gpu_type = $env:GPU_TYPE
Expand Down Expand Up @@ -54,33 +71,7 @@ function Install-Driver {
# Install the file with the specified path from earlier
Write-Output 'Running the driver installer...'
Start-Process -FilePath $filepath -ArgumentList $install_args -Wait
Write-Output 'Done!'

# Handle driver mode configuration
# This assumes we have the prior knowledge on which GPU can use which mode.
$driver_mode = $env:DRIVER_MODE
if ($driver_mode -eq "WDDM") {
Write-Output "Setting driver mode to WDDM..."
nvidia-smi -fdm 0
} elseif ($driver_mode -eq "TCC") {
Write-Output "Setting driver mode to TCC..."
nvidia-smi -fdm 1
} elseif ($driver_mode -eq "MCDM") {
Write-Output "Setting driver mode to MCDM..."
nvidia-smi -fdm 2
} else {
Write-Output "Unknown driver mode: $driver_mode"
exit 1
}
# Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA)
$nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*"
foreach ($device in $nvidia_devices) {
Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))"
pnputil /disable-device "$($device.InstanceId)"
pnputil /enable-device "$($device.InstanceId)"
}
# Give it a minute to settle:
Start-Sleep -Seconds 5
Write-Output 'Install complete; driver mode + device cycle handled by configure_driver_mode.ps1.'
}

# Run the functions
Expand Down
Loading