From dd1ba6ba297b84bcc015004831ed7b3ce1963585 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Fri, 12 Jun 2026 14:23:18 +0000 Subject: [PATCH] Add opt-in GPU spreading for the parallel test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ONEAPI_TEST_SPREAD_GPUS=1 pins each test worker process to a distinct GPU via ZE_AFFINITY_MASK (claimed round-robin through an atomic mkdir counter, set before `using oneAPI` so the Level Zero driver picks it up at init). This spreads the suite across all tiles instead of oversubscribing device 0. device() is task-local and Malt runs each test in a fresh task, so a device! in init_worker_code would not stick — process-level pinning is the robust approach. Default (unset) keeps every worker on the first device, preserving single-tile oversubscription which is useful for surfacing contention bugs. Verified: 6 concurrent claimers -> 6 distinct device UUIDs; real harness with --jobs=4 spreads cleanly (SUCCESS). Co-Authored-By: Claude Opus 4.8 (1M context) --- test/runtests.jl | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index b7015787..27b97776 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,7 +24,45 @@ end args = parse_args(ARGS) +# Optional: spread test workers across all available GPUs, one worker per device +# (round-robin), by pinning each worker *process* to a device with ZE_AFFINITY_MASK. +# `device()` is task-local and Malt runs each test in a fresh task, so a `device!` in +# `init_worker_code` would not stick — pinning the process via the driver is the robust +# way to make every task on a worker use the same GPU. +# +# Enabled with ONEAPI_TEST_SPREAD_GPUS=1. When unset (the default) every worker stays on +# the first device, which oversubscribes a single tile — useful for surfacing +# contention/oversubscription bugs. 
+const spread_gpus = lowercase(get(ENV, "ONEAPI_TEST_SPREAD_GPUS", "")) in ("1", "true", "yes")
+worker_env = Vector{Pair{String, String}}()
+device_claim_code = :()
+if spread_gpus
+    ndev = length(oneAPI.devices())
+    # shared, node-local directory used as an atomic round-robin counter (mkdir is atomic)
+    devdir = mktempdir(; prefix = "oneapi_test_gpus_")
+    append!(worker_env, ["ONEAPI_TEST_DEVDIR" => devdir, "ONEAPI_TEST_NDEV" => string(ndev)])
+    @info "Spreading test workers across $ndev GPU(s) via ZE_AFFINITY_MASK (ONEAPI_TEST_SPREAD_GPUS=1)"
+    # NOTE: runs on the worker as the very first thing, before `using oneAPI` — so the
+    # Level Zero driver picks up ZE_AFFINITY_MASK at init and the process sees only its tile.
+    device_claim_code = quote
+        let dir = ENV["ONEAPI_TEST_DEVDIR"], ndev = parse(Int, ENV["ONEAPI_TEST_NDEV"])
+            i = 0
+            while true
+                try
+                    mkdir(joinpath(dir, string(i)))
+                    break
+                catch err  # EEXIST means slot i is already claimed; any other failure must not spin forever
+                    (err isa Base.IOError && err.code == Base.UV_EEXIST) || rethrow()
+                    i += 1
+                end
+            end
+            ENV["ZE_AFFINITY_MASK"] = string(i % ndev)
+        end
+    end
+end
+
 init_worker_code = quote
+    $device_claim_code
     using oneAPI, Adapt
     import GPUArrays
 
@@ -105,4 +143,4 @@ init_code = quote
         ..@grab_output, ..@on_device, ..sink
 end
 
-runtests(oneAPI, args; testsuite, init_code, init_worker_code)
+runtests(oneAPI, args; testsuite, init_code, init_worker_code, env = worker_env)