From dd1ba6ba297b84bcc015004831ed7b3ce1963585 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Fri, 12 Jun 2026 14:23:18 +0000
Subject: [PATCH] Add opt-in GPU spreading for the parallel test suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ONEAPI_TEST_SPREAD_GPUS=1 pins each test worker process to a single GPU via
ZE_AFFINITY_MASK (assigned round-robin through an atomic mkdir counter, set before
`using oneAPI` so the Level Zero driver picks it up at init). This spreads the suite
across all tiles instead of oversubscribing device 0; with more workers than GPUs the
assignment wraps around, so the devices are shared evenly.

device() is task-local and Malt runs each test in a fresh task, so a device! in
init_worker_code would not stick — process-level pinning is the robust approach.

Default (unset) keeps every worker on the first device, preserving single-tile
oversubscription which is useful for surfacing contention bugs.

Verified: 6 concurrent claimers -> 6 distinct device UUIDs; real harness with
--jobs=4 spreads cleanly (SUCCESS).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/runtests.jl | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index b7015787..27b97776 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -24,7 +24,45 @@ end
 
 args = parse_args(ARGS)
 
+# Optional: spread test workers across all available GPUs, one worker per device
+# (round-robin), by pinning each worker *process* to a device with ZE_AFFINITY_MASK.
+# `device()` is task-local and Malt runs each test in a fresh task, so a `device!` in
+# `init_worker_code` would not stick — pinning the process via the driver is the robust
+# way to make every task on a worker use the same GPU.
+#
+# Enabled with ONEAPI_TEST_SPREAD_GPUS=1. When unset (the default) every worker stays on
+# the first device, which oversubscribes a single tile — useful for surfacing
+# contention/oversubscription bugs.
+const spread_gpus = lowercase(get(ENV, "ONEAPI_TEST_SPREAD_GPUS", "")) in ("1", "true", "yes")
+worker_env = Vector{Pair{String, String}}()
+device_claim_code = :()
+if spread_gpus
+    ndev = length(oneAPI.devices())
+    ndev > 0 || error("ONEAPI_TEST_SPREAD_GPUS=1 requires at least one oneAPI device, found none")
+    # shared, node-local directory used as an atomic round-robin counter (mkdir is atomic)
+    devdir = mktempdir(; prefix = "oneapi_test_gpus_")
+    push!(worker_env, "ONEAPI_TEST_DEVDIR" => devdir, "ONEAPI_TEST_NDEV" => string(ndev))
+    @info "Spreading test workers across $ndev GPU(s) via ZE_AFFINITY_MASK (ONEAPI_TEST_SPREAD_GPUS=1)"
+    # NOTE: runs on the worker as the very first thing, before `using oneAPI` — so the
+    # Level Zero driver picks up ZE_AFFINITY_MASK at init and the process sees only its tile.
+    device_claim_code = quote
+        let dir = ENV["ONEAPI_TEST_DEVDIR"], ndev = parse(Int, ENV["ONEAPI_TEST_NDEV"]), i = 0
+            while true
+                try
+                    mkdir(joinpath(dir, string(i)))
+                    break
+                catch err  # EEXIST: slot already claimed, try the next; anything else is fatal
+                    (err isa Base.IOError && err.code == Base.UV_EEXIST) || rethrow()
+                    i += 1
+                end
+            end
+            ENV["ZE_AFFINITY_MASK"] = string(i % ndev)
+        end
+    end
+end
+
 init_worker_code = quote
+    $device_claim_code
     using oneAPI, Adapt
 
     import GPUArrays
@@ -105,4 +143,4 @@ init_code = quote
            ..@grab_output, ..@on_device, ..sink
 end
 
-runtests(oneAPI, args; testsuite, init_code, init_worker_code)
+runtests(oneAPI, args; testsuite, init_code, init_worker_code, env = worker_env)