From 4568e9cdb6794c77cbcf88b1f89f7a5a81debb82 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 2 Jul 2026 15:01:42 +0000
Subject: [PATCH 1/3] ci/test-matrix.yml: re-enable arm64 gh200
 nightly-standard row

Reverts the disable in 4c70cfac05 now that the runner team has fixed the
pool-side hang on stream-ordered memory allocator calls.
---
 ci/test-matrix.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index c9eafd4f521..54b49a405e3 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -97,9 +97,7 @@ linux:
     - { ARCH: 'arm64', PY_VER: '3.12',  CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-numba-cuda' } }
     - { ARCH: 'arm64', PY_VER: '3.12',  CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-numba-cuda' } }
     # nightly-standard (arm64 nightly-only runners — per runner team request)
-    # TODO: gh200 row disabled — currently hangs on stream-ordered memory
-    #       allocator (cudaMallocAsync); runner pool needs fixing first.
-    # - { ARCH: 'arm64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'gh200',      GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-standard' } }
+    - { ARCH: 'arm64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'gh200',      GPU_COUNT: '1', DRIVER: 'latest',     ENV: { MODE: 'nightly-standard' } }
     - { ARCH: 'arm64', PY_VER: '3.14',  CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '2', DRIVER: 'latest',     ENV: { MODE: 'nightly-standard' } }
     - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '2', DRIVER: 'latest',     ENV: { MODE: 'nightly-standard' } }
 

From 8d51cf761dda86fc29eeb3e68aba8860e366ef19 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 2 Jul 2026 15:01:55 +0000
Subject: [PATCH 2/3] Temporarily add push trigger to ci-nightly.yml for
 testing

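Testing aid only: drop this commit before the series is merged.
Otherwise the nightly workflow would also run on every push to main
and to pull-request/<number> branches, not just on its schedule.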
---
 .github/workflows/ci-nightly.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index c6ecb9fdff8..79e73863fc3 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -17,6 +17,10 @@ concurrency:
   cancel-in-progress: true
 
 on:
+  push:
+    branches:
+      - "main"
+      - "pull-request/[0-9]+"
   schedule:
     # 2:17 AM UTC daily, after the midnight main CI build finishes.
     # Avoid minute 0 because GitHub documents high scheduled-workflow load

From f1482141ca84c31f468278171037ab0e6a739b43 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 2 Jul 2026 19:41:45 +0000
Subject: [PATCH 3/3] CI: deselect test_get_bar_size_in_kb on gh200 runners

The cufile BAR-size query returns CUDA_ERROR_NOT_SUPPORTED on
Grace+Hopper (unified memory, no discrete PCIe BAR). Tracked in #2299;
remove this deselect once the test skipif is fixed upstream.

Uses PYTEST_ADDOPTS (set to the --deselect option only when matrix.GPU
is 'gh200', empty otherwise) so no changes to run-tests are needed.
---
 .github/workflows/test-wheel-linux.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 982d4d1c491..08e95aba16e 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -351,6 +351,9 @@ jobs:
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+          # #2299: the cufile BAR-size query returns CUDA_ERROR_NOT_SUPPORTED on
+          # Grace+Hopper (gh200); deselect the test until its skipif is fixed.
+          PYTEST_ADDOPTS: ${{ matrix.GPU == 'gh200' && '--deselect tests/test_cufile.py::test_get_bar_size_in_kb' || '' }}
         run: run-tests bindings
 
       - name: Run cuda.bindings benchmarks (smoke test)