JuliaGPU · michel2323 · Jun 18, 2026
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
@@ -33,7 +33,11 @@
     # perform a reduction
     d = 1
     while d < items
-        barrier(0)
+        # `barrier(0)` lowers to an `OpControlBarrier` without the WorkgroupMemory
+        # storage-class bit, so it does not order the shared-local tree accesses across
+        # the barrier. Fence local memory so that each tree step sees the previous
+        # step's `shared[]` writes.
+        barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE)
         index = 2 * d * (item-1) + 1
         @inbounds if index <= items
             other_val = if index + d <= items

diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
@@ -214,7 +214,13 @@ end
 ## Synchronization and Printing
 
 @device_override @inline function KA.__synchronize()
-    barrier(0)
+    # Device override of `KA.__synchronize`: a workgroup barrier that also fences local
+    # and global memory, matching CUDA `__syncthreads()` semantics. `barrier(0)` lowers
+    # to `OpControlBarrier` with `SequentiallyConsistent` but no storage-class bit, which
+    # the SPIR-V spec treats as ordering *no* memory, so shared-local or global writes
+    # made before the barrier are not guaranteed visible to other work-items after it.
+    # Passing both fence flags ORs in the WorkgroupMemory/CrossWorkgroupMemory bits.
+    barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE | SPIRVIntrinsics.GLOBAL_MEM_FENCE)
 end
 
 @device_override @inline function KA.__print(args...)