From 4568e9cdb6794c77cbcf88b1f89f7a5a81debb82 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Jul 2026 15:01:42 +0000 Subject: [PATCH 1/3] ci/test-matrix.yml: re-enable arm64 gh200 nightly-standard row Reverts the disable in 4c70cfac05 now that the runner team has fixed the pool-side hang on stream-ordered memory allocator calls. --- ci/test-matrix.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index c9eafd4f521..54b49a405e3 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -97,9 +97,7 @@ linux: - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda' } } - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-numba-cuda' } } # nightly-standard (arm64 nightly-only runners — per runner team request) - # TODO: gh200 row disabled — currently hangs on stream-ordered memory - # allocator (cudaMallocAsync); runner pool needs fixing first. - # - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'gh200', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-standard' } } + - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'gh200', GPU_COUNT: '1', DRIVER: 'latest', ENV: { MODE: 'nightly-standard' } } - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '2', DRIVER: 'latest', ENV: { MODE: 'nightly-standard' } } - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '2', DRIVER: 'latest', ENV: { MODE: 'nightly-standard' } } From 8d51cf761dda86fc29eeb3e68aba8860e366ef19 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Jul 2026 15:01:55 +0000 Subject: [PATCH 2/3] Temporarily add push trigger to ci-nightly.yml for testing Remove before merging. 
--- .github/workflows/ci-nightly.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml index c6ecb9fdff8..79e73863fc3 100644 --- a/.github/workflows/ci-nightly.yml +++ b/.github/workflows/ci-nightly.yml @@ -17,6 +17,10 @@ concurrency: cancel-in-progress: true on: + push: + branches: + - "main" + - "pull-request/[0-9]+" schedule: # 2:17 AM UTC daily, after the midnight main CI build finishes. # Avoid minute 0 because GitHub documents high scheduled-workflow load From f1482141ca84c31f468278171037ab0e6a739b43 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Jul 2026 19:41:45 +0000 Subject: [PATCH 3/3] CI: deselect test_get_bar_size_in_kb on gh200 runners The cufile BAR-size query returns CUDA_ERROR_NOT_SUPPORTED on Grace+Hopper (unified memory, no discrete PCIe BAR). Tracked in #2299; remove this deselect once the test skipif is fixed upstream. Uses PYTEST_ADDOPTS so no changes to run-tests are needed. --- .github/workflows/test-wheel-linux.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 982d4d1c491..08e95aba16e 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -351,6 +351,9 @@ jobs: env: CUDA_VER: ${{ matrix.CUDA_VER }} LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + # #2299: BAR-size query returns CUDA_ERROR_NOT_SUPPORTED on Grace+Hopper; + # deselect the test on gh200 runners until upstream cufile guards it. + PYTEST_ADDOPTS: ${{ matrix.GPU == 'gh200' && '--deselect tests/test_cufile.py::test_get_bar_size_in_kb' || '' }} run: run-tests bindings - name: Run cuda.bindings benchmarks (smoke test)