From 6d66a23075721c1bd9d0f0971971c567ee6c0fde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Tue, 9 Jun 2026 23:45:00 +0100
Subject: [PATCH 01/11] Improve public API docs and expand the manual.

Add missing docstrings for backends, kernel handles, and reflection macros;
expand quickstart, kernels, and API pages with examples.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/ci.yml            |  34 ++++--
 docs/Project.toml                   |   4 +
 docs/make.jl                        |   2 -
 docs/src/api.md                     |  48 +++++++-
 docs/src/examples/memcopy_static.md |   2 +-
 docs/src/implementations.md         |   2 +-
 docs/src/kernels.md                 | 111 ++++++++++++++++---
 docs/src/quickstart.md              |  51 ++++++++-
 src/KernelAbstractions.jl           | 164 ++++++++++++++++++++++++----
 src/nditeration.jl                  |  18 +++
 src/reflection.jl                   |  46 ++++----
 11 files changed, 405 insertions(+), 77 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ccc66a7f6..2aabc3845 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,4 +1,5 @@
 name: CI-KA
+
 on:
   push:
     branches:
@@ -6,14 +7,17 @@ on:
       - release-*
     tags: '*'
   pull_request:
+
 defaults:
   run:
     shell: bash
+
 concurrency:
   # Skip intermediate builds: always.
   # Cancel intermediate builds: only if it is a pull request build.
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
 jobs:
   CI:
     name: CI
@@ -159,6 +163,7 @@ jobs:
       - uses: codecov/codecov-action@v6
         with:
           files: lcov.info
+
   docs:
     name: Documentation
     runs-on: ubuntu-latest
@@ -167,12 +172,19 @@ jobs:
       - uses: julia-actions/setup-julia@v3
         with:
           version: '1'
-      - run: |
-          julia --project=docs -e 'import Pkg; Pkg.develop(path=".")'
-          julia --project=docs docs/make.jl
+      - uses: julia-actions/cache@v3
+      - name: "Instantiate docs environment"
+        shell: julia --project=docs --color=yes {0}
+        run: |
+          using Pkg
+          Pkg.instantiate()
+      - name: "Build docs"
+        run: |
+          julia --project=docs --color=yes docs/make.jl
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+
   doctests:
     name: Doctests
     runs-on: ubuntu-latest
@@ -181,9 +193,13 @@ jobs:
       - uses: julia-actions/setup-julia@v3
         with:
           version: '1'
-      - run: |
-          julia --project=docs -e 'import Pkg; Pkg.develop(path=".")'
-          julia --project=docs -e '
-            using Documenter: doctest
-            using KernelAbstractions
-            doctest(KernelAbstractions; manual = true)'
+      - uses: julia-actions/cache@v3
+      - name: "Run doctests"
+        shell: julia --project=docs --color=yes {0}
+        run: |
+          using Pkg
+          Pkg.instantiate()
+
+          using Documenter: doctest
+          using KernelAbstractions
+          doctest(KernelAbstractions; manual = true)
diff --git a/docs/Project.toml b/docs/Project.toml
index 1814eb330..96dfe1047 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,5 +1,9 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
 Documenter = "1"
+
+[sources]
+KernelAbstractions = {path = ".."}
diff --git a/docs/make.jl b/docs/make.jl
index 6188fdbfd..dd6753d25 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,5 +1,3 @@
-push!(Base.LOAD_PATH, dirname(@__DIR__))
-
 using KernelAbstractions
 using Documenter
 
diff --git a/docs/src/api.md b/docs/src/api.md
index 4e107075b..ee068f336 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -13,21 +13,63 @@
 @uniform
 @groupsize
 @ndrange
-synchronize
-allocate
 ```
 
 ## Host language
 
+### Backends and arrays
+
 ```@docs
+Backend
+GPU
+CPU
+POCLBackend
+get_backend
+KernelAbstractions.allocate
 KernelAbstractions.zeros
+KernelAbstractions.ones
+KernelAbstractions.copyto!
+KernelAbstractions.pagelock!
+KernelAbstractions.unsafe_free!
+KernelAbstractions.functional
 KernelAbstractions.supports_unified
+KernelAbstractions.supports_atomics
+KernelAbstractions.supports_float64
 ```
 
-## Internal
+### Devices and execution
+
+```@docs
+synchronize
+KernelAbstractions.device
+KernelAbstractions.ndevices
+KernelAbstractions.device!
+KernelAbstractions.priority!
+```
+
+### Kernel handles
 
 ```@docs
 KernelAbstractions.Kernel
+KernelAbstractions.workgroupsize
+KernelAbstractions.ndrange
+KernelAbstractions.backend
+```
+
+## Reflection
+
+These macros help inspect the generated kernel code. LLVM IR reflection via
+[`@ka_code_llvm`](@ref) is only supported on the CPU backend.
+
+```@docs
+@ka_code_typed
+@ka_code_llvm
+```
+
+## Internal
+
+```@docs
 KernelAbstractions.partition
 KernelAbstractions.@context
+KernelAbstractions.argconvert
 ```
diff --git a/docs/src/examples/memcopy_static.md b/docs/src/examples/memcopy_static.md
index cb84c387d..deba4a8e5 100644
--- a/docs/src/examples/memcopy_static.md
+++ b/docs/src/examples/memcopy_static.md
@@ -1,4 +1,4 @@
-# Memcopy with static NDRange
+# [Memcopy with static NDRange](@id memcopy_static)
 
 The first example simple copies memory from `B` to `A`. In contrast to the previous examples
 it uses a fully static kernel configuration. Specializing the kernel on the iteration range itself.
diff --git a/docs/src/implementations.md b/docs/src/implementations.md
index cc2c05d24..0b640d4f3 100644
--- a/docs/src/implementations.md
+++ b/docs/src/implementations.md
@@ -1,4 +1,4 @@
-# Notes for backend implementations
+# [Notes for backend implementations](@id implementations_notes)
 
 ## Semantics of `KernelAbstractions.synchronize`
 
diff --git a/docs/src/kernels.md b/docs/src/kernels.md
index a45ea3fec..54e206a61 100644
--- a/docs/src/kernels.md
+++ b/docs/src/kernels.md
@@ -1,23 +1,108 @@
-# Writing kernels 
+# Writing kernels
 
-These kernel language constructs are intended to be used as part
-of [`@kernel`](@ref) functions and not valid outside that context.
+These kernel language constructs are intended to be used inside [`@kernel`](@ref) functions.
+They are not valid in ordinary Julia code (except when using experimental `@kernel cpu=false`).
 
 ## Constant arguments
 
-Kernel functions allow for input arguments to be marked with the
-[`@Const`](@ref) macro. It informs the compiler that the memory
-accessed through that marked input argument, will not be written
-to as part of the kernel. This has the implication that input arguments
-are **not** allowed to alias each other. If you are used to CUDA C this
-is similar to `const restrict`.
+Kernel functions allow input arguments to be marked with the [`@Const`](@ref) macro. It informs
+the compiler that the memory accessed through that argument will not be written to as part of
+the kernel, and that it does not alias any other memory in the kernel. If you are used to CUDA C,
+this is similar to `const restrict`.
+
+```julia
+@kernel function saxpy!(a, @Const(X), Y)
+    I = @index(Global)
+    @inbounds Y[I] = a * X[I] + Y[I]
+end
+```
 
 ## Indexing
 
-There are several [`@index`](@ref) variants.
+The [`@index`](@ref) macro returns the index of the current work item. Choose a **granularity**
+and an optional **kind**:
+
+| Granularity | Meaning |
+|-------------|---------|
+| `Global` | Index over the full `ndrange` (use for global memory) |
+| `Group` | Index of the current workgroup |
+| `Local` | Index within the current workgroup |
+
+| Kind | Result type |
+|------|-------------|
+| `Linear` (default) | `Int` linear index |
+| `Cartesian` | `CartesianIndex` for multi-dimensional `ndrange` |
+| `NTuple` | `NTuple` of `Int` indices |
+
+```julia
+@kernel function fill_diagonal!(A, val)
+    I = @index(Global, Cartesian)
+    if I[1] == I[2]
+        @inbounds A[I] = val
+    end
+end
+
+@kernel function linear_example(A)
+    I = @index(Global, Linear)   # 1, 2, 3, ...
+    g = @index(Group, Linear)    # workgroup id
+    l = @index(Local, Linear)    # lane within workgroup
+    @inbounds A[I] = g + l
+end
+```
+
+Inside a kernel, [`@groupsize`](@ref) and [`@ndrange`](@ref) query the launch configuration:
+
+```julia
+@kernel function scale!(A, factor)
+    N = prod(@groupsize())
+    I = @index(Global, Linear)
+    lmem = @localmem Float32 (N,)
+    i = @index(Local, Linear)
+    lmem[i] = factor
+    @synchronize()
+    @inbounds A[I] = lmem[i]
+end
+```
+
+## Local memory, synchronization, and private memory
+
+[`@localmem`](@ref) declares storage shared by all work items in a workgroup. Reads and writes
+must be separated by [`@synchronize`](@ref) if they are performed by different work items:
+
+```julia
+@kernel function reverse_block!(A)
+    I = @index(Global, Linear)
+    i = @index(Local, Linear)
+    N = prod(@groupsize())
+    buf = @localmem Int (N,)
+    buf[i] = i
+    @synchronize()
+    @inbounds A[I] = buf[N - i + 1]
+end
+```
+
+[`@private`](@ref) and [`@uniform`](@ref) are deprecated for KernelAbstractions 1.0. Prefer
+`MArray` for per-lane scratch storage that does not need to survive across `@synchronize`.
+
+## Launching kernels
+
+Construct a kernel by calling the kernel function on a backend and optional static sizes, then
+launch it with `ndrange`:
+
+```julia
+# dynamic sizes — supply ndrange (and optionally workgroupsize) at launch
+kernel = my_kernel(backend)
+kernel(A, ndrange=size(A))
 
-## Local memory, variable lifetime and private memory
+# static workgroup size
+kernel = my_kernel(backend, 256)
+kernel(A, ndrange=size(A))
 
-[`@localmem`](@ref), [`@synchronize`](@ref), [`@private`](@ref)
+# static workgroup size and ndrange — fewer runtime checks, may reduce recompilation
+kernel = my_kernel(backend, 32, size(A))
+kernel(A)
+```
 
-# Launching kernels
+On GPU backends, obtain the backend from an array with [`get_backend`](@ref) and always call
+[`synchronize`](@ref) before reading results on the host. See the [Quickstart](@ref) for a full walkthrough and the Examples section of the manual
+for larger patterns.
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
index 82bc0f600..4756ad34b 100644
--- a/docs/src/quickstart.md
+++ b/docs/src/quickstart.md
@@ -43,6 +43,21 @@ all(A .== 2.0)
 All kernels are launched asynchronously.
 The [`synchronize`](@ref) blocks the *host* until the kernel has completed on the backend.
 
+### Static workgroup size and `ndrange`
+
+When the workgroup size and `ndrange` are known ahead of time, pass them to the kernel
+constructor to enable additional compile-time optimizations and avoid supplying them at
+every launch:
+
+```julia
+# workgroup size 32, ndrange (128, 128) — fixed for this kernel object
+kernel = mul2_kernel(dev, 32, size(A))
+kernel(A)  # ndrange inferred from construction
+synchronize(dev)
+```
+
+See also [Memcopy with static NDRange](@ref memcopy_static).
+
 ## Launching kernel on the backend
 
 To launch the kernel on a backend-supported backend `isa(backend, KA.GPU)` (e.g., `CUDABackend()`, `ROCBackend()`, `oneAPIBackend()`, `MetalBackend()`), we generate the kernel
@@ -108,6 +123,38 @@ function mymul(A, B)
 end
 ```
 
-## Using task programming to launch kernels in parallel.
+## Using task programming to launch kernels in parallel
+
+As shown in the [Synchronization](@ref) section above, multiple kernels can be enqueued on the
+same backend before a single [`synchronize`](@ref) call. The same pattern extends to Julia's
+task-based parallelism: launch kernels from [`Threads.@spawn`](https://docs.julialang.org/en/stable/base/multi-threading/#Base.Threads.@spawn)
+tasks when you want to overlap kernel execution with other asynchronous host work.
+
+On GPU backends, [`synchronize`](@ref) is **cooperative** — it yields to the Julia scheduler
+rather than blocking inside a driver call, so other tasks can make progress while a kernel runs.
+See [Notes for backend implementations](@ref implementations_notes) for the contract backend authors must follow.
+
+```julia
+function cooperative_wait(task::Task)
+    while !Base.istaskdone(task)
+        yield()
+    end
+    return wait(task)
+end
+
+function exchange_and_compute!(backend, A, B)
+    recv = Threads.@spawn begin
+        mul2_kernel(backend, 64)(A, ndrange=length(A))
+        synchronize(backend)  # cooperative on GPU backends
+    end
+    send = Threads.@spawn begin
+        mul2_kernel(backend, 64)(B, ndrange=length(B))
+        synchronize(backend)
+    end
+    cooperative_wait(recv)
+    cooperative_wait(send)
+end
+```
 
-TODO
+A full MPI example that overlaps communication with device copies is in
+[`examples/mpi.jl`](https://github.com/JuliaGPU/KernelAbstractions.jl/blob/master/examples/mpi.jl).
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 3881da55c..260ee8cfd 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -18,7 +18,7 @@ using Adapt
 """
     @kernel function f(args) end
 
-Takes a function definition and generates a [`Kernel`](@ref) constructor from it.
+Takes a function definition and generates a [`Kernel`](@ref KernelAbstractions.Kernel) constructor from it.
 The enclosed function is allowed to contain kernel language constructs.
 In order to call it the kernel has first to be specialized on the backend
 and then invoked on the arguments.
@@ -35,18 +35,34 @@ and then invoked on the arguments.
 - [`@synchronize`](@ref)
 - [`@print`](@ref)
 
-# Example:
+# Kernel constructor
+
+After defining a kernel function `f`, call `f(backend[, workgroupsize[, ndrange]])` to obtain a
+[`Kernel`](@ref KernelAbstractions.Kernel) specialized for that backend. Workgroup size and `ndrange` can be fixed at
+construction time (for fewer runtime checks and less recompilation) or supplied at launch:
+
+```julia
+f(backend)                    # dynamic workgroup size and ndrange
+f(backend, 64)                # static workgroup size of 64
+f(backend, 64, 1024)          # static workgroup size and ndrange
+f(backend, 64, (128, 128))    # multi-dimensional ndrange
+```
+
+# Example
 
 ```julia
+using KernelAbstractions
+
 @kernel function vecadd(A, @Const(B))
     I = @index(Global)
     @inbounds A[I] += B[I]
 end
 
+dev = CPU()
 A = ones(1024)
 B = rand(1024)
-vecadd(CPU(), 64)(A, B, ndrange=size(A))
-synchronize(backend)
+vecadd(dev, 64)(A, B, ndrange=length(A))
+synchronize(dev)
 ```
 """
 macro kernel(expr)
@@ -64,7 +80,7 @@ This allows for two different configurations:
 
 - [`@context`](@ref)
 
-!!! warn
+!!! warning
     This is an experimental feature.
 
 !!! note
@@ -195,7 +211,22 @@ function unsafe_free! end
 unsafe_free!(::AbstractArray) = return
 
 """
-Abstract type for all KernelAbstractions backends.
+    Backend
+
+Abstract supertype for all KernelAbstractions backends.
+
+Concrete backends (for example `CUDABackend` from CUDA.jl or [`CPU`](@ref) from this package)
+determine where arrays are allocated and where kernels execute. Use [`get_backend`](@ref) to
+obtain the backend for an array and [`allocate`](@ref) to create storage on a backend.
+
+# Example
+
+```julia
+backend = get_backend(A)
+kernel = my_kernel(backend, 256)
+kernel(A, ndrange=length(A))
+synchronize(backend)
+```
 """
 abstract type Backend end
 
@@ -214,7 +245,18 @@ export KernelIntrinsics
 # - @ndrange
 ###
 
+"""
+    groupsize(ctx)
+
+Return the workgroup size as a tuple. Equivalent to [`@groupsize`](@ref) inside a kernel.
+"""
 function groupsize end
+
+"""
+    ndrange(ctx)
+
+Return the launch `ndrange` as a tuple. Equivalent to [`@ndrange`](@ref) inside a kernel.
+"""
 function ndrange end
 
 """
@@ -333,7 +375,7 @@ workgroup. `cond` is not allowed to have any visible sideffects.
   - `GPU`: This synchronization will only occur if the `cond` evaluates.
   - `CPU`: This synchronization will always occur.
 
-!!! warn
+!!! warning
     This variant of the `@synchronize` macro violates the requirement that `@synchronize` must be encountered
     by all workitems of a work-group executing the kernel or by none at all.
     Since v`0.9.34` this version of the macro is deprecated and lowers to `@synchronize()`
@@ -349,7 +391,7 @@ end
 
 Access the hidden context object used by KernelAbstractions.
 
-!!! warn
+!!! warning
     Only valid to be used from a kernel with `cpu=false`.
 
 !!! note
@@ -663,25 +705,34 @@ function priority!(::Backend, prio::Symbol)
 end
 
 """
-    device(::Backend)::Int
+    device(backend::Backend)::Int
 
-Returns the ordinal number of the currently active device starting at one.
+Return the 1-based index of the currently active device for `backend`.
 """
 function device(::Backend)
     return 1
 end
 
 """
-    ndevices(::Backend)::Int
+    ndevices(backend::Backend)::Int
 
-Returns the number of devices the backend supports.
+Return the number of devices available to `backend`.
 """
 function ndevices(::Backend)
     return 1
 end
 
 """
-    device!(::Backend, id::Int)
+    device!(backend::Backend, id::Int)
+
+Select the active device for `backend`. `id` is a 1-based device index and must satisfy
+`1 <= id <= ndevices(backend)`.
+
+# Example
+
+```julia
+device!(CUDABackend(), 2)  # use the second CUDA device
+```
 """
 function device!(backend::Backend, id::Int)
     if !(0 < id <= ndevices(backend))
@@ -721,9 +772,19 @@ import .NDIteration: get
 """
     Kernel{Backend, WorkgroupSize, NDRange, Func}
 
-Kernel closure struct that is used to represent the backend
-kernel on the host. `WorkgroupSize` is the number of workitems
-in a workgroup.
+Host-side handle for a kernel specialized on a backend, workgroup size, and `ndrange`.
+
+Kernels are created by calling a [`@kernel`](@ref) function on a backend, for example
+`my_kernel(CUDABackend(), 256)`. The returned object is callable:
+
+```julia
+kernel = my_kernel(backend, 64)
+kernel(A, B, ndrange=length(A))   # launch asynchronously
+synchronize(backend)
+```
+
+Use [`workgroupsize`](@ref KernelAbstractions.workgroupsize), [`ndrange`](@ref KernelAbstractions.ndrange),
+and [`backend`](@ref KernelAbstractions.backend) to inspect a kernel's static configuration.
 
 !!! note
     Backend implementations **must** implement:
@@ -741,12 +802,40 @@ function Base.similar(kernel::Kernel{D, WS, ND}, f::F) where {D, WS, ND, F}
     return Kernel{D, WS, ND, F}(kernel.backend, f)
 end
 
-workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
-ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize, NDRange} = NDRange
-backend(kernel::Kernel) = kernel.backend
+"""
+    workgroupsize(kernel::Kernel)
+
+Return the workgroup size type parameter of `kernel` (a `StaticSize` type or `DynamicSize`).
+"""
+function workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize}
+    return WorkgroupSize
+end
+
+"""
+    ndrange(kernel::Kernel)
+
+Return the `ndrange` type parameter of `kernel` (a `StaticSize` type or `DynamicSize`).
+"""
+function ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize, NDRange}
+    return NDRange
+end
+
+"""
+    backend(kernel::Kernel)
+
+Return the [`Backend`](@ref) that `kernel` was constructed for.
+"""
+function backend(kernel::Kernel)
+    return kernel.backend
+end
 
 """
-Partition a kernel for the given ndrange and workgroupsize.
+    partition(kernel, ndrange, workgroupsize)
+
+Partition the iteration space of `kernel` into workgroups.
+
+Returns the blocked iteration space and whether dynamic bounds-checking is required for the
+last (possibly partial) workgroup. Primarily used by backend implementations and tests.
 """
 @inline function partition(kernel, ndrange, workgroupsize)
     static_ndrange = KernelAbstractions.ndrange(kernel)
@@ -844,9 +933,12 @@ __size(args::Tuple) = Tuple{args...}
 __size(i::Int) = Tuple{i}
 
 """
-    argconvert(::Kernel, arg)
+    argconvert(kernel::Kernel, arg)
 
-Convert arguments to the device side representation.
+Convert `arg` to the device-side representation expected by `kernel`'s backend.
+
+Backend implementations define methods for their array and scalar types. This is called
+automatically when a kernel is launched.
 """
 argconvert(k::Kernel{T}, arg) where {T} =
     error("Don't know how to convert arguments for Kernel{$T}")
@@ -881,6 +973,34 @@ include("pocl/pocl.jl")
 using .POCL
 export POCLBackend
 
+"""
+    POCLBackend()
+
+CPU backend that compiles kernels to OpenCL via [POCL](https://portablecl.org/) and executes
+them on the host. This is the concrete type behind the [`CPU`](@ref) alias.
+
+Requires a working POCL installation. Use [`functional`](@ref KernelAbstractions.functional)(`CPU()`)
+to check availability at runtime.
+"""
+POCLBackend
+
+"""
+    CPU
+
+Type alias for [`POCLBackend`](@ref), the CPU execution backend.
+
+Construct with `CPU()` (equivalent to `POCLBackend()`). Kernels run on the host via POCL/OpenCL
+using the same programming model as GPU backends, which is useful for debugging and for running
+kernel code without a GPU.
+
+# Example
+
+```julia
+A = ones(Float32, 1024)
+mul2_kernel(CPU(), 64)(A, ndrange=length(A))
+synchronize(CPU())
+```
+"""
 const CPU = POCLBackend
 
 # precompile
diff --git a/src/nditeration.jl b/src/nditeration.jl
index 3b2be2353..83d9d4e57 100644
--- a/src/nditeration.jl
+++ b/src/nditeration.jl
@@ -10,7 +10,25 @@ struct DynamicCheck end
 struct NoDynamicCheck end
 
 abstract type _Size end
+
+"""
+    DynamicSize
+
+Marker type indicating that a kernel's workgroup size or `ndrange` is chosen at launch time.
+"""
 struct DynamicSize <: _Size end
+
+"""
+    StaticSize{S}
+
+Marker type encoding a compile-time workgroup size or `ndrange` as a tuple `S`.
+
+# Example
+
+```julia
+my_kernel(backend, StaticSize(64), StaticSize(1024))
+```
+"""
 struct StaticSize{S} <: _Size
     function StaticSize{S}() where {S}
         return new{S::Tuple{Vararg{Int}}}()
diff --git a/src/reflection.jl b/src/reflection.jl
index 53142cc1a..b853e4bd1 100644
--- a/src/reflection.jl
+++ b/src/reflection.jl
@@ -96,23 +96,22 @@ end
 
 
 """
-Get the typed IR for a kernel
+    @ka_code_typed [kwargs...] kernel(args...; ndrange=..., workgroupsize=...)
+
+Return the typed IR for a kernel's device function, similar to `InteractiveUtils.code_typed`.
+
+Pass `interactive=true` to descend into the IR with [Cthulhu](https://github.com/JuliaDebug/Cthulhu.jl)
+(must be loaded in the session). If `ndrange` is fixed at kernel construction time, it can be
+omitted at the call site.
 
 # Examples
+
+```julia
+@ka_code_typed my_kernel(backend)(A, ndrange=length(A))
+@ka_code_typed my_kernel(backend, 64)(A, ndrange=length(A))
+@ka_code_typed optimize=false my_kernel(backend)(A, ndrange=length(A))
+@ka_code_typed interactive=true my_kernel(CPU())(A, ndrange=length(A))
 ```
-@ka_code_typed kernel(args. ndrange=...)
-@ka_code_typed kernel(args. ndrange=... workgroupsize=...)
-@ka_code_typed optimize=false kernel(args. ndrange=...)
-```
-To use interactive mode (with Cthulhu), call
-```
-@ka_code_typed interactive=true kernel(args. ndrange=...)
-```
-If ndrange is statically defined, then you could call
-```
-@ka_code_typed kernel(args.)
-```
-Works for CPU or CUDA kernels, with static or dynamic declarations
 """
 macro ka_code_typed(ex0...)
     ex, args, old_args, kern = format_ex(ex0)
@@ -133,19 +132,18 @@ end
 
 
 """
-Get the llvm code for a kernel
+    @ka_code_llvm [kwargs...] kernel(args...; ndrange=..., workgroupsize=...)
+
+Return the LLVM IR for a kernel's device function, similar to `InteractiveUtils.code_llvm`.
+
+Only supported on the CPU backend. GPU kernels will throw an error.
 
 # Examples
+
+```julia
+@ka_code_llvm my_kernel(CPU())(A, ndrange=length(A))
+@ka_code_llvm my_kernel(CPU(), 64)(A, ndrange=length(A))
 ```
-@ka_code_llvm kernel(args. ndrange=...)
-@ka_code_llvm kernel(args. ndrange=... workgroupsize=...)
-@ka_code_llvm optimize=false kernel(args. ndrange=...)
-```
-If ndrange is statically defined, then you could call
-```
-@ka_code_llvm kernel(args.)
-```
-Works for CPU kernels ONLY, with static or dynamic declarations
 """
 macro ka_code_llvm(ex0...)
     ex, args, old_args, kern = format_ex(ex0)

From c6c8dec55bcbe55e58655f11655d934540ec7105 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 10:10:20 +0100
Subject: [PATCH 02/11] Remove requirement of working POCL installation

---
 src/KernelAbstractions.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 260ee8cfd..f9f0a2cf8 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -978,9 +978,6 @@ export POCLBackend
 
 CPU backend that compiles kernels to OpenCL via [POCL](https://portablecl.org/) and executes
 them on the host. This is the concrete type behind the [`CPU`](@ref) alias.
-
-Requires a working POCL installation. Use [`functional`](@ref KernelAbstractions.functional)(`CPU()`)
-to check availability at runtime.
 """
 POCLBackend
 

From c77f48fd7ab10a01cd8df63cea573dc160f94a5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 19:39:27 +0100
Subject: [PATCH 03/11] [CI] Always upload build artifact

---
 .github/workflows/ci.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2aabc3845..6ca5f7a84 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -184,6 +184,11 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+      - name: Upload build artefact
+        uses: actions/upload-pages-artifact@v5
+        with:
+          path: docs/build
+          name: docs-build
 
   doctests:
     name: Doctests

From ca3dec6498732e0aa802a803952e20682e4e76d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 22:28:24 +0100
Subject: [PATCH 04/11] Do not mention `cpu=false` (it's a no-op)

---
 docs/src/kernels.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/kernels.md b/docs/src/kernels.md
index 54e206a61..efcc38e7d 100644
--- a/docs/src/kernels.md
+++ b/docs/src/kernels.md
@@ -1,7 +1,7 @@
 # Writing kernels
 
 These kernel language constructs are intended to be used inside [`@kernel`](@ref) functions.
-They are not valid in ordinary Julia code (except when using experimental `@kernel cpu=false`).
+They are not valid in ordinary Julia code.
 
 ## Constant arguments
 

From 0c13e55f95322cc730c9fe5ca72f08bb06b32e51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 22:32:16 +0100
Subject: [PATCH 05/11] `get_backend` works with all backends

---
 docs/src/kernels.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/src/kernels.md b/docs/src/kernels.md
index efcc38e7d..5bc5c1600 100644
--- a/docs/src/kernels.md
+++ b/docs/src/kernels.md
@@ -103,6 +103,5 @@ kernel = my_kernel(backend, 32, size(A))
 kernel(A)
 ```
 
-On GPU backends, obtain the backend from an array with [`get_backend`](@ref) and always call
-[`synchronize`](@ref) before reading results on the host. See the [Quickstart](@ref) for a full walkthrough and the Examples section of the manual
-for larger patterns.
+Obtain the backend from an array with [`get_backend`](@ref) and always call [`synchronize`](@ref) before reading results on the host.
+See the [Quickstart](@ref) for a full walkthrough and the Examples section of the manual for larger patterns.

From 5c5bde01cbdda3099b49407809f618df19e43986 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 22:40:24 +0100
Subject: [PATCH 06/11] Document that only static `@localmem` is currently
 supported.

Clarify in the kernels guide that local memory allocation sizes must be
known at compile time.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/src/kernels.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/src/kernels.md b/docs/src/kernels.md
index 5bc5c1600..0e22f6565 100644
--- a/docs/src/kernels.md
+++ b/docs/src/kernels.md
@@ -66,8 +66,11 @@ end
 
 ## Local memory, synchronization, and private memory
 
-[`@localmem`](@ref) declares storage shared by all work items in a workgroup. Reads and writes
-must be separated by [`@synchronize`](@ref) if they are performed by different work items:
+[`@localmem`](@ref) declares storage shared by all work items in a workgroup. Only **static**
+local memory is supported at the moment: the allocation size must be known at compile time
+(for example `@localmem Int (32,)` or `@localmem Int (N,)` where `N = prod(@groupsize())` and
+the workgroup size is fixed when the kernel is constructed). Reads and writes must be
+separated by [`@synchronize`](@ref) if they are performed by different work items:
 
 ```julia
 @kernel function reverse_block!(A)

From a6f175810a3a72db1e7394dbc97a1de280f3acf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 23:01:05 +0100
Subject: [PATCH 07/11] Remove misleading "equivalent to" sentences

Suggested-by: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
---
 src/KernelAbstractions.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index f9f0a2cf8..a55644752 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -248,14 +248,14 @@ export KernelIntrinsics
 """
     groupsize(ctx)
 
-Return the workgroup size as a tuple. Equivalent to [`@groupsize`](@ref) inside a kernel.
+Return the workgroup size as a tuple.
 """
 function groupsize end
 
 """
     ndrange(ctx)
 
-Return the launch `ndrange` as a tuple. Equivalent to [`@ndrange`](@ref) inside a kernel.
+Return the launch `ndrange` as a tuple.
 """
 function ndrange end
 

From 226135b33ec8c7850915e2dca278e0c2d414feca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <mose@gnu.org>
Date: Wed, 10 Jun 2026 23:07:38 +0100
Subject: [PATCH 08/11] Add note for implementers to
 `device`/`ndevices`/`device!`

Suggested-by: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
---
 src/KernelAbstractions.jl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index a55644752..14c4d9e26 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -708,6 +708,10 @@ end
     device(backend::Backend)::Int
 
 Return the 1-based index of the currently active device for `backend`.
+
+!!! note
+    Backend implementations **may** implement `device(backend::Backend)::Int` if they support multiple devices.
+    They **must** implement [`ndevices`](@ref KernelAbstractions.ndevices) and [`device!`](@ref KernelAbstractions.device!).
 """
 function device(::Backend)
     return 1
@@ -717,6 +721,10 @@ end
     ndevices(backend::Backend)::Int
 
 Return the number of devices available to `backend`.
+
+!!! note
+    Backend implementations **must** implement `ndevices(backend::Backend)::Int` and [`device!`](@ref KernelAbstractions.device!).
+    They **may** also implement [`device`](@ref KernelAbstractions.device) if they support multiple devices.
 """
 function ndevices(::Backend)
     return 1
@@ -733,6 +741,10 @@ Select the active device for `backend`. `id` is a 1-based device index and must
 ```julia
 device!(CUDABackend(), 2)  # use the second CUDA device
 ```
+
+!!! note
+    Backend implementations **must** implement `device!(backend::Backend, id::Int)` and [`ndevices`](@ref KernelAbstractions.ndevices).
+    They **may** also implement [`device`](@ref KernelAbstractions.device) if they support multiple devices.
 """
 function device!(backend::Backend, id::Int)
     if !(0 < id <= ndevices(backend))

From 4501e10d0e08de8d416474ee315bd40667e8e91a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <m.giordano@ucl.ac.uk>
Date: Thu, 11 Jun 2026 14:20:07 +0100
Subject: [PATCH 09/11] Turn code blocks into doctests/runnable examples

---
 docs/src/kernels.md    | 128 ++++++++++++++++++++++++++++++++++++++---
 docs/src/quickstart.md |  27 +++++----
 2 files changed, 138 insertions(+), 17 deletions(-)

diff --git a/docs/src/kernels.md b/docs/src/kernels.md
index 0e22f6565..6d72f8f28 100644
--- a/docs/src/kernels.md
+++ b/docs/src/kernels.md
@@ -11,10 +11,30 @@ the kernel, and that it does not alias any other memory in the kernel. If you ar
 this is similar to `const restrict`.
 
-```julia
+```jldoctest
+using KernelAbstractions
+
 @kernel function saxpy!(a, @Const(X), Y)
     I = @index(Global)
     @inbounds Y[I] = a * X[I] + Y[I]
 end
+
+a = 2.0
+X = collect(1.0:8.0)
+Y = fill(1.0, 8)
+saxpy!(CPU(), 8, size(Y))(a, X, Y)
+Y
+
+# output
+
+8-element Vector{Float64}:
+  3.0
+  5.0
+  7.0
+  9.0
+ 11.0
+ 13.0
+ 15.0
+ 17.0
 ```
 
 ## Indexing
@@ -34,7 +54,9 @@ and an optional **kind**:
 | `Cartesian` | `CartesianIndex` for multi-dimensional `ndrange` |
 | `NTuple` | `NTuple` of `Int` indices |
 
-```julia
+```jldoctest
+using KernelAbstractions
+
 @kernel function fill_diagonal!(A, val)
     I = @index(Global, Cartesian)
     if I[1] == I[2]
@@ -42,26 +64,92 @@ and an optional **kind**:
     end
 end
 
-@kernel function linear_example(A)
+A = collect(reshape(1.0:16.0, 4, 4))
+fill_diagonal!(CPU(), 4, size(A))(A, 42)
+A
+
+# output
+
+4×4 Matrix{Float64}:
+ 42.0   5.0   9.0  13.0
+  2.0  42.0  10.0  14.0
+  3.0   7.0  42.0  15.0
+  4.0   8.0  12.0  42.0
+```
+
+```jldoctest
+using KernelAbstractions
+
+@kernel function linear_example!(A)
     I = @index(Global, Linear)   # 1, 2, 3, ...
     g = @index(Group, Linear)    # workgroup id
     l = @index(Local, Linear)    # lane within workgroup
     @inbounds A[I] = g + l
 end
+
+A = collect(1.0:16.0)
+linear_example!(CPU(), 4, size(A))(A)
+A
+
+# output
+
+16-element Vector{Float64}:
+ 2.0
+ 3.0
+ 4.0
+ 5.0
+ 3.0
+ 4.0
+ 5.0
+ 6.0
+ 4.0
+ 5.0
+ 6.0
+ 7.0
+ 5.0
+ 6.0
+ 7.0
+ 8.0
 ```
 
 Inside a kernel, [`@groupsize`](@ref) and [`@ndrange`](@ref) query the launch configuration:
 
-```julia
+```jldoctest
+using KernelAbstractions
+
 @kernel function scale!(A, factor)
-    N = prod(@groupsize())
+    N = @uniform prod(@groupsize())
     I = @index(Global, Linear)
     lmem = @localmem Float32 (N,)
     i = @index(Local, Linear)
     lmem[i] = factor
     @synchronize()
-    @inbounds A[I] = lmem[i]
+    @inbounds A[I] = A[I] * lmem[i]
 end
+
+A = collect(1.0:16.0)
+scale!(CPU(), 8, size(A))(A, 2)
+A
+
+# output
+
+16-element Vector{Float64}:
+  2.0
+  4.0
+  6.0
+  8.0
+ 10.0
+ 12.0
+ 14.0
+ 16.0
+ 18.0
+ 20.0
+ 22.0
+ 24.0
+ 26.0
+ 28.0
+ 30.0
+ 32.0
 ```
 
 ## Local memory, synchronization, and private memory
@@ -72,16 +160,42 @@ local memory is supported at the moment: the allocation size must be known at co
 the workgroup size is fixed when the kernel is constructed). Reads and writes must be
 separated by [`@synchronize`](@ref) if they are performed by different work items:
 
-```julia
+```jldoctest
+using KernelAbstractions
+
 @kernel function reverse_block!(A)
     I = @index(Global, Linear)
     i = @index(Local, Linear)
-    N = prod(@groupsize())
+    N = @uniform prod(@groupsize())
     buf = @localmem Int (N,)
     buf[i] = i
     @synchronize()
     @inbounds A[I] = buf[N - i + 1]
 end
+
+A = collect(1.0:16.0)
+reverse_block!(CPU(), 8, size(A))(A)
+A
+
+# output
+
+16-element Vector{Float64}:
+ 8.0
+ 7.0
+ 6.0
+ 5.0
+ 4.0
+ 3.0
+ 2.0
+ 1.0
+ 8.0
+ 7.0
+ 6.0
+ 5.0
+ 4.0
+ 3.0
+ 2.0
+ 1.0
 ```
 
 [`@private`](@ref) and [`@uniform`](@ref) are deprecated for KernelAbstractions 1.0. Prefer
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
index 4756ad34b..696a6bc3b 100644
--- a/docs/src/quickstart.md
+++ b/docs/src/quickstart.md
@@ -17,7 +17,9 @@ you can use the [kernel language](@ref api_kernel_language). As an example, the
 below will multiply each element of the array `A` by `2`. It uses the [`@index`](@ref) macro
 to obtain the global linear index of the current work item.
 
-```julia
+```@example mul2_kernel
+using KernelAbstractions
+
 @kernel function mul2_kernel(A)
   I = @index(Global)
   A[I] = 2 * A[I]
@@ -32,12 +34,12 @@ the second argument being the workgroup size. This returns a generated kernel
 executable that is then executed with the input argument `A` and the additional
 argument being a static `ndrange`.
 
-```julia
+```@example mul2_kernel
 dev = CPU()
 A = ones(1024, 1024)
-ev = mul2_kernel(dev, 64)(A, ndrange=size(A))
+mul2_kernel(dev, 64)(A, ndrange=size(A))
 synchronize(dev)
-all(A .== 2.0)
+@assert all(A .== 2.0)
 ```
 
 All kernels are launched asynchronously.
@@ -49,11 +51,12 @@ When the workgroup size and `ndrange` are known ahead of time, pass them to the
 constructor to enable additional compile-time optimizations and avoid supplying them at
 every launch:
 
-```julia
+```@example mul2_kernel
-# workgroup size 32, ndrange (128, 128) — fixed for this kernel object
+# workgroup size 32, ndrange size(A) == (1024, 1024) — fixed for this kernel object
 kernel = mul2_kernel(dev, 32, size(A))
 kernel(A)  # ndrange inferred from construction
 synchronize(dev)
+@assert all(A .== 4)
 ```
 
 See also [Memcopy with static NDRange](@ref memcopy_static).
@@ -89,7 +92,7 @@ The kernel generation and execution are then
 backend = get_backend(A)
 mul2_kernel(backend, 64)(A, ndrange=size(A))
 synchronize(backend)
-all(A .== 2)
+@assert all(A .== 2)
 ```
 
 ## Synchronization
@@ -100,17 +103,19 @@ all(A .== 2)
 The code around KA may heavily rely on
 [`GPUArrays`](https://github.com/JuliaGPU/GPUArrays.jl), for example, to
 initialize variables.
-```julia
+```@example mul2_kernel
 function mymul(A)
     A .= 1.0
     backend = get_backend(A)
     ev = mul2_kernel(backend, 64)(A, ndrange=size(A))
     synchronize(backend)
-    all(A .== 2.0)
+    @assert all(A .== 2.0)
 end
+
+mymul(A)
 ```
 
-```julia
+```@example mul2_kernel
 function mymul(A, B)
     A .= 1.0
     B .= 3.0
@@ -119,8 +124,10 @@ function mymul(A, B)
     mul2_kernel(backend, 64)(A, ndrange=size(A))
     mul2_kernel(backend, 64)(B, ndrange=size(B))
     synchronize(backend)
-    all(A .+ B .== 8.0)
+    @assert all(A .+ B .== 8.0)
 end
+
+mymul(A, ones(size(A)))
 ```
 
 ## Using task programming to launch kernels in parallel

From c9eb245b348067d40db26173cac99e97b92fdeea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?=
 <765740+giordano@users.noreply.github.com>
Date: Thu, 11 Jun 2026 16:26:26 +0200
Subject: [PATCH 10/11] Remove `StaticSize` example

Co-authored-by: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
---
 src/nditeration.jl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/nditeration.jl b/src/nditeration.jl
index 83d9d4e57..aacaa8bff 100644
--- a/src/nditeration.jl
+++ b/src/nditeration.jl
@@ -22,12 +22,6 @@ struct DynamicSize <: _Size end
     StaticSize{S}
 
 Marker type encoding a compile-time workgroup size or `ndrange` as a tuple `S`.
-
-# Example
-
-```julia
-my_kernel(backend, StaticSize(64), StaticSize(1024))
-```
 """
 struct StaticSize{S} <: _Size
     function StaticSize{S}() where {S}

From 354cecc089f60643e127e6ba67150c87a069d6bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <m.giordano@ucl.ac.uk>
Date: Thu, 11 Jun 2026 16:01:04 +0100
Subject: [PATCH 11/11] Add `DynamicSize` and `StaticSize` to the manual in the
 "Internal" section

Also, make it crystal clear that internals are not meant to be directly used by
end-users.
---
 docs/src/api.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/src/api.md b/docs/src/api.md
index ee068f336..66e4fd783 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -68,8 +68,13 @@ on the CPU backend via [`@ka_code_llvm`](@ref).
 
 ## Internal
 
+The functions and types in this section are internal and not part of the public API contract.
+They are documented here only for developers of and contributors to `KernelAbstractions.jl`; end users should not rely on them, and anyone who does should expect breakage without notice.
+
 ```@docs
 KernelAbstractions.partition
 KernelAbstractions.@context
 KernelAbstractions.argconvert
+KernelAbstractions.NDIteration.DynamicSize
+KernelAbstractions.NDIteration.StaticSize
 ```