From aec725fa62f9f50decd3d82dc4be5f6032af26dd Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Thu, 18 Jun 2026 15:43:37 +0000 Subject: [PATCH] Make host USM resident before kernel use Host USM allocations (HostBuffer) and the host pattern buffer used by `fill!` were never made resident on the device, unlike DeviceBuffer and SharedBuffer allocations (which already call `make_resident`). A GPU kernel that reads a non-resident host buffer can take a NotPresent pagefault under GC/allocation churn, even though host USM is nominally device-accessible. Make host USM resident at allocation time and for `fill!`'s pattern buffer, removing the asymmetry with device/shared USM. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/array.jl | 7 +++++-- src/pool.jl | 8 +++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/array.jl b/src/array.jl index 3a8ffddb..a6bb9b3d 100644 --- a/src/array.jl +++ b/src/array.jl @@ -526,9 +526,12 @@ function Base.fill!(A::oneDenseArray{T}, val) where T val = convert(T, val) sizeof(T) == 0 && return A - # execute! is async, so we need to allocate the pattern in USM memory - # and keep it alive until the operation completes. + # execute! is async, so we need to allocate the pattern in USM memory and keep it alive + # until the operation completes. The fill reads this host buffer on the GPU, so it must + # be made resident on the device like any other USM a kernel reads (see + # `allocate(::Type{oneL0.HostBuffer}, ...)`). buf = oneL0.host_alloc(context(A), sizeof(T), Base.datatype_alignment(T)) + oneL0.make_resident(context(A), device(), buf) unsafe_store!(convert(Ptr{T}, buf), val) unsafe_fill!(context(A), device(), pointer(A), convert(ZePtr{T}, buf), length(A)) synchronize(global_queue(context(A), device())) diff --git a/src/pool.jl b/src/pool.jl index 165a7f07..ed3b75ba 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -55,7 +55,13 @@ end function allocate(::Type{oneL0.HostBuffer}, ctx, dev, bytes::Int, alignment::Int) bytes == 0 && return oneL0.HostBuffer(ZE_NULL, bytes, ctx) - host_alloc(ctx, bytes, alignment) + buf = host_alloc(ctx, bytes, alignment) + # Host USM must be made resident on the device, exactly like the device and shared + # allocations above. A GPU kernel that reads a non-resident host buffer can take a + # NotPresent pagefault under GC/allocation churn, even though host USM is nominally + # device-accessible. + make_resident(ctx, dev, buf) + return buf end function release(buf::oneL0.AbstractBuffer)