diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ccc66a7f6..6ca5f7a84 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,4 +1,5 @@
 name: CI-KA
+
 on:
   push:
     branches:
@@ -6,14 +7,17 @@ on:
       - release-*
     tags: '*'
   pull_request:
+
 defaults:
   run:
     shell: bash
+
 concurrency:
   # Skip intermediate builds: always.
   # Cancel intermediate builds: only if it is a pull request build.
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
 jobs:
   CI:
     name: CI
@@ -159,6 +163,7 @@ jobs:
       - uses: codecov/codecov-action@v6
         with:
           files: lcov.info
+
   docs:
     name: Documentation
     runs-on: ubuntu-latest
@@ -167,12 +172,24 @@ jobs:
       - uses: julia-actions/setup-julia@v3
         with:
           version: '1'
-      - run: |
-          julia --project=docs -e 'import Pkg; Pkg.develop(path=".")'
-          julia --project=docs docs/make.jl
+      - uses: julia-actions/cache@v3
+      - name: "Instantiate docs environment"
+        shell: julia --project=docs --color=yes {0}
+        run: |
+          using Pkg
+          Pkg.instantiate()
+      - name: "Build docs"
+        run: |
+          julia --project=docs --color=yes docs/make.jl
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+      - name: Upload build artefact
+        uses: actions/upload-pages-artifact@v5
+        with:
+          path: docs/build
+          name: docs-build
+
   doctests:
     name: Doctests
     runs-on: ubuntu-latest
@@ -181,9 +198,13 @@ jobs:
       - uses: julia-actions/setup-julia@v3
         with:
           version: '1'
-      - run: |
-          julia --project=docs -e 'import Pkg; Pkg.develop(path=".")'
-          julia --project=docs -e '
-            using Documenter: doctest
-            using KernelAbstractions
-            doctest(KernelAbstractions; manual = true)'
+      - uses: julia-actions/cache@v3
+      - name: "Run doctests"
+        shell: julia --project=docs --color=yes {0}
+        run: |
+          using Pkg
+          Pkg.instantiate()
+
+          using Documenter: doctest
+          using KernelAbstractions
+          doctest(KernelAbstractions; manual = true)
diff --git a/docs/Project.toml b/docs/Project.toml
index 1814eb330..96dfe1047 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,5 +1,9 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
 Documenter = "1"
+
+[sources]
+KernelAbstractions = {path = ".."}
diff --git a/docs/make.jl b/docs/make.jl
index 6188fdbfd..dd6753d25 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,5 +1,3 @@
-push!(Base.LOAD_PATH, dirname(@__DIR__))
-
 using KernelAbstractions
 using Documenter
 
diff --git a/docs/src/api.md b/docs/src/api.md
index 4e107075b..66e4fd783 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -13,21 +13,68 @@
 @uniform
 @groupsize
 @ndrange
-synchronize
-allocate
 ```
 
 ## Host language
 
+### Backends and arrays
+
 ```@docs
+Backend
+GPU
+CPU
+POCLBackend
+get_backend
+KernelAbstractions.allocate
 KernelAbstractions.zeros
+KernelAbstractions.ones
+KernelAbstractions.copyto!
+KernelAbstractions.pagelock!
+KernelAbstractions.unsafe_free!
+KernelAbstractions.functional
 KernelAbstractions.supports_unified
+KernelAbstractions.supports_atomics
+KernelAbstractions.supports_float64
 ```
 
-## Internal
+### Devices and execution
+
+```@docs
+synchronize
+KernelAbstractions.device
+KernelAbstractions.ndevices
+KernelAbstractions.device!
+KernelAbstractions.priority!
+```
+
+### Kernel handles
 
 ```@docs
 KernelAbstractions.Kernel
+KernelAbstractions.workgroupsize
+KernelAbstractions.ndrange
+KernelAbstractions.backend
+```
+
+## Reflection
+
+These macros help inspect the generated kernel code. LLVM reflection via
+[`@ka_code_llvm`](@ref) is only supported on the CPU backend.
+
+```@docs
+@ka_code_typed
+@ka_code_llvm
+```
+
+## Internal
+
+The functionality in this section is considered internal and is not part of the public API contract.
+It is documented here for developers and contributors of `KernelAbstractions.jl`; end users should not rely on it and should expect breakage without notice if they do.
+
+```@docs
 KernelAbstractions.partition
 KernelAbstractions.@context
+KernelAbstractions.argconvert
+KernelAbstractions.NDIteration.DynamicSize
+KernelAbstractions.NDIteration.StaticSize
 ```
diff --git a/docs/src/examples/memcopy_static.md b/docs/src/examples/memcopy_static.md
index cb84c387d..deba4a8e5 100644
--- a/docs/src/examples/memcopy_static.md
+++ b/docs/src/examples/memcopy_static.md
@@ -1,4 +1,4 @@
-# Memcopy with static NDRange
+# [Memcopy with static NDRange](@id memcopy_static)
 
 The first example simple copies memory from `B` to `A`. In contrast to the previous examples
 it uses a fully static kernel configuration. Specializing the kernel on the iteration range itself.
diff --git a/docs/src/implementations.md b/docs/src/implementations.md
index cc2c05d24..0b640d4f3 100644
--- a/docs/src/implementations.md
+++ b/docs/src/implementations.md
@@ -1,4 +1,4 @@
-# Notes for backend implementations
+# [Notes for backend implementations](@id implementations_notes)
 
 ## Semantics of `KernelAbstractions.synchronize`
 
diff --git a/docs/src/kernels.md b/docs/src/kernels.md
index a45ea3fec..6d72f8f28 100644
--- a/docs/src/kernels.md
+++ b/docs/src/kernels.md
@@ -1,23 +1,224 @@
-# Writing kernels 
+# Writing kernels
 
-These kernel language constructs are intended to be used as part
-of [`@kernel`](@ref) functions and not valid outside that context.
+These kernel language constructs are intended to be used inside [`@kernel`](@ref) functions.
+They are not valid in ordinary Julia code.
 
 ## Constant arguments
 
-Kernel functions allow for input arguments to be marked with the
-[`@Const`](@ref) macro. It informs the compiler that the memory
-accessed through that marked input argument, will not be written
-to as part of the kernel. This has the implication that input arguments
-are **not** allowed to alias each other. If you are used to CUDA C this
-is similar to `const restrict`.
+Kernel functions allow input arguments to be marked with the [`@Const`](@ref) macro. It informs
+the compiler that the memory accessed through that argument will not be written to as part of
+the kernel, and that it does not alias any other memory in the kernel. If you are used to CUDA C,
+this is similar to `const restrict`.
+
+```julia
+using KernelAbstractions
+
+@kernel function saxpy!(a, @Const(X), Y)
+    I = @index(Global)
+    @inbounds Y[I] = a * X[I] + Y[I]
+end
+
+a = 2.0
+X = collect(1.0:8.0)
+Y = fill(1.0, 8)
+saxpy!(CPU(), 8, size(Y))(a, X, Y)
+Y
+
+# output
+
+8-element Vector{Float64}:
+  3.0
+  5.0
+  7.0
+  9.0
+ 11.0
+ 13.0
+ 15.0
+ 17.0
+```
 
 ## Indexing
 
-There are several [`@index`](@ref) variants.
+The [`@index`](@ref) macro returns the index of the current work item. Choose a **granularity**
+and an optional **kind**:
+
+| Granularity | Meaning |
+|-------------|---------|
+| `Global` | Index over the full `ndrange` (use for global memory) |
+| `Group` | Index of the current workgroup |
+| `Local` | Index within the current workgroup |
+
+| Kind | Result type |
+|------|-------------|
+| `Linear` (default) | `Int` linear index |
+| `Cartesian` | `CartesianIndex` for multi-dimensional `ndrange` |
+| `NTuple` | `NTuple` of `Int` indices |
+
+```jldoctest
+using KernelAbstractions
+
+@kernel function fill_diagonal!(A, val)
+    I = @index(Global, Cartesian)
+    if I[1] == I[2]
+        @inbounds A[I] = val
+    end
+end
+
+A = collect(reshape(1.0:16.0, 4, 4))
+fill_diagonal!(CPU(), 4, size(A))(A, 42)
+A
+
+# output
+
+4×4 Matrix{Float64}:
+ 42.0   5.0   9.0  13.0
+  2.0  42.0  10.0  14.0
+  3.0   7.0  42.0  15.0
+  4.0   8.0  12.0  42.0
+```
+
+```jldoctest
+using KernelAbstractions
+
+@kernel function linear_example!(A)
+    I = @index(Global, Linear)   # 1, 2, 3, ...
+    g = @index(Group, Linear)    # workgroup id
+    l = @index(Local, Linear)    # lane within workgroup
+    @inbounds A[I] = g + l
+end
+
+A = collect(1.0:16.0)
+linear_example!(CPU(), 4, size(A))(A)
+A
+
+# output
+
+16-element Vector{Float64}:
+ 2.0
+ 3.0
+ 4.0
+ 5.0
+ 3.0
+ 4.0
+ 5.0
+ 6.0
+ 4.0
+ 5.0
+ 6.0
+ 7.0
+ 5.0
+ 6.0
+ 7.0
+ 8.0
+```
+
+Inside a kernel, [`@groupsize`](@ref) and [`@ndrange`](@ref) query the launch configuration. The example below uses `@groupsize` to size a local-memory buffer:
+
+```jldoctest
+using KernelAbstractions
+
+@kernel function scale!(A, factor)
+    N = @uniform prod(@groupsize())
+    I = @index(Global, Linear)
+    lmem = @localmem Float32 (N,)
+    i = @index(Local, Linear)
+    lmem[i] = factor
+    @synchronize()
+    @inbounds A[I] = A[I] * lmem[i]
+end
+
+A = collect(1.0:16.0)
+scale!(CPU(), 8, size(A))(A, 2)
+A
+
+# output
+
+16-element Vector{Float64}:
+  2.0
+  4.0
+  6.0
+  8.0
+ 10.0
+ 12.0
+ 14.0
+ 16.0
+ 18.0
+ 20.0
+ 22.0
+ 24.0
+ 26.0
+ 28.0
+ 30.0
+ 32.0
+```
+
+## Local memory, synchronization, and private memory
+
+[`@localmem`](@ref) declares storage shared by all work items in a workgroup. Only **static**
+local memory is supported at the moment: the allocation size must be known at compile time
+(for example `@localmem Int (32,)` or `@localmem Int (N,)` where `N = prod(@groupsize())` and
+the workgroup size is fixed when the kernel is constructed). Reads and writes must be
+separated by [`@synchronize`](@ref) if they are performed by different work items:
+
+```jldoctest
+using KernelAbstractions
+
+@kernel function reverse_block!(A)
+    I = @index(Global, Linear)
+    i = @index(Local, Linear)
+    N = @uniform prod(@groupsize())
+    buf = @localmem Int (N,)
+    buf[i] = i
+    @synchronize()
+    @inbounds A[I] = buf[N - i + 1]
+end
+
+A = collect(1.0:16.0)
+reverse_block!(CPU(), 8, size(A))(A)
+A
+
+# output
+
+16-element Vector{Float64}:
+ 8.0
+ 7.0
+ 6.0
+ 5.0
+ 4.0
+ 3.0
+ 2.0
+ 1.0
+ 8.0
+ 7.0
+ 6.0
+ 5.0
+ 4.0
+ 3.0
+ 2.0
+ 1.0
+```
+
+[`@private`](@ref) and [`@uniform`](@ref) are deprecated for KernelAbstractions 1.0. Prefer
+`MArray` for per-lane scratch storage that does not need to survive across `@synchronize`.
+
+## Launching kernels
+
+Construct a kernel by calling the kernel function on a backend and optional static sizes, then
+launch it with `ndrange`:
+
+```julia
+# dynamic sizes — supply ndrange (and optionally workgroupsize) at launch
+kernel = my_kernel(backend)
+kernel(A, ndrange=size(A))
 
-## Local memory, variable lifetime and private memory
+# static workgroup size
+kernel = my_kernel(backend, 256)
+kernel(A, ndrange=size(A))
 
-[`@localmem`](@ref), [`@synchronize`](@ref), [`@private`](@ref)
+# static workgroup size and ndrange — fewer runtime checks, may reduce recompilation
+kernel = my_kernel(backend, 32, size(A))
+kernel(A)
+```
 
-# Launching kernels
+Obtain the backend from an array with [`get_backend`](@ref) and always call [`synchronize`](@ref) before reading results on the host.
+See the [Quickstart](@ref) for a full walkthrough and the Examples section of the manual for larger patterns.
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
index 82bc0f600..696a6bc3b 100644
--- a/docs/src/quickstart.md
+++ b/docs/src/quickstart.md
@@ -17,7 +17,9 @@ you can use the [kernel language](@ref api_kernel_language). As an example, the
 below will multiply each element of the array `A` by `2`. It uses the [`@index`](@ref) macro
 to obtain the global linear index of the current work item.
 
-```julia
+```@example mul2_kernel
+using KernelAbstractions
+
 @kernel function mul2_kernel(A)
   I = @index(Global)
   A[I] = 2 * A[I]
@@ -32,17 +34,33 @@ the second argument being the workgroup size. This returns a generated kernel
 executable that is then executed with the input argument `A` and the additional
 argument being a static `ndrange`.
 
-```julia
+```@example mul2_kernel
 dev = CPU()
 A = ones(1024, 1024)
-ev = mul2_kernel(dev, 64)(A, ndrange=size(A))
+mul2_kernel(dev, 64)(A, ndrange=size(A))
 synchronize(dev)
-all(A .== 2.0)
+@assert all(A .== 2.0)
 ```
 
 All kernels are launched asynchronously.
 The [`synchronize`](@ref) blocks the *host* until the kernel has completed on the backend.
 
+### Static workgroup size and `ndrange`
+
+When the workgroup size and `ndrange` are known ahead of time, pass them to the kernel
+constructor to enable additional compile-time optimizations and avoid supplying them at
+every launch:
+
+```@example mul2_kernel
+# workgroup size 32, ndrange size(A) == (1024, 1024) — fixed for this kernel object
+kernel = mul2_kernel(dev, 32, size(A))
+kernel(A)  # ndrange inferred from construction
+synchronize(dev)
+@assert all(A .== 4)
+```
+
+See also [Memcopy with static NDRange](@ref memcopy_static).
+
 ## Launching kernel on the backend
 
 To launch the kernel on a backend-supported backend `isa(backend, KA.GPU)` (e.g., `CUDABackend()`, `ROCBackend()`, `oneAPIBackend()`, `MetalBackend()`), we generate the kernel
@@ -74,7 +92,7 @@ The kernel generation and execution are then
 backend = get_backend(A)
 mul2_kernel(backend, 64)(A, ndrange=size(A))
 synchronize(backend)
-all(A .== 2)
+@assert all(A .== 2)
 ```
 
 ## Synchronization
@@ -85,17 +103,19 @@ all(A .== 2)
 The code around KA may heavily rely on
 [`GPUArrays`](https://github.com/JuliaGPU/GPUArrays.jl), for example, to
 initialize variables.
-```julia
+```@example mul2_kernel
 function mymul(A)
     A .= 1.0
     backend = get_backend(A)
     ev = mul2_kernel(backend, 64)(A, ndrange=size(A))
     synchronize(backend)
-    all(A .== 2.0)
+    @assert all(A .== 2.0)
 end
+
+mymul(A)
 ```
 
-```julia
+```@example mul2_kernel
 function mymul(A, B)
     A .= 1.0
     B .= 3.0
@@ -104,10 +124,44 @@ function mymul(A, B)
     mul2_kernel(backend, 64)(A, ndrange=size(A))
     mul2_kernel(backend, 64)(B, ndrange=size(B))
     synchronize(backend)
-    all(A .+ B .== 8.0)
+    @assert all(A .+ B .== 8.0)
 end
+
+mymul(A, ones(size(A)))
 ```
 
-## Using task programming to launch kernels in parallel.
+## Using task programming to launch kernels in parallel
+
+As shown in the [Synchronization](@ref) section above, multiple kernels can be enqueued on the
+same backend before a single [`synchronize`](@ref) call. The same pattern extends to Julia's
+task-based parallelism: launch kernels from [`Threads.@spawn`](https://docs.julialang.org/en/stable/base/multi-threading/#Base.Threads.@spawn)
+tasks when you want to overlap kernel execution with other asynchronous host work.
+
+On GPU backends, [`synchronize`](@ref) is **cooperative** — it yields to the Julia scheduler
+rather than blocking inside a driver call, so other tasks can make progress while a kernel runs.
+See [Notes for backend implementations](@ref implementations_notes) for the contract backend authors must follow.
+
+```julia
+function cooperative_wait(task::Task)
+    while !Base.istaskdone(task)
+        yield()
+    end
+    return wait(task)
+end
+
+function exchange_and_compute!(backend, A, B)
+    recv = Threads.@spawn begin
+        mul2_kernel(backend, 64)(A, ndrange=length(A))
+        synchronize(backend)  # cooperative on GPU backends
+    end
+    send = Threads.@spawn begin
+        mul2_kernel(backend, 64)(B, ndrange=length(B))
+        synchronize(backend)
+    end
+    cooperative_wait(recv)
+    cooperative_wait(send)
+end
+```
 
-TODO
+A full MPI example that overlaps communication with device copies is in
+[`examples/mpi.jl`](https://github.com/JuliaGPU/KernelAbstractions.jl/blob/master/examples/mpi.jl).
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 3881da55c..14c4d9e26 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -18,7 +18,7 @@ using Adapt
 """
     @kernel function f(args) end
 
-Takes a function definition and generates a [`Kernel`](@ref) constructor from it.
+Takes a function definition and generates a [`Kernel`](@ref KernelAbstractions.Kernel) constructor from it.
 The enclosed function is allowed to contain kernel language constructs.
 In order to call it the kernel has first to be specialized on the backend
 and then invoked on the arguments.
@@ -35,18 +35,34 @@ and then invoked on the arguments.
 - [`@synchronize`](@ref)
 - [`@print`](@ref)
 
-# Example:
+# Kernel constructor
+
+After defining a kernel function `f`, call `f(backend[, workgroupsize[, ndrange]])` to obtain a
+[`Kernel`](@ref KernelAbstractions.Kernel) specialized for that backend. Workgroup size and `ndrange` can be fixed at
+construction time (for fewer runtime checks and less recompilation) or supplied at launch:
+
+```julia
+f(backend)                    # dynamic workgroup size and ndrange
+f(backend, 64)                # static workgroup size of 64
+f(backend, 64, 1024)          # static workgroup size and ndrange
+f(backend, 64, (128, 128))    # multi-dimensional ndrange
+```
+
+# Example
 
 ```julia
+using KernelAbstractions
+
 @kernel function vecadd(A, @Const(B))
     I = @index(Global)
     @inbounds A[I] += B[I]
 end
 
+dev = CPU()
 A = ones(1024)
 B = rand(1024)
-vecadd(CPU(), 64)(A, B, ndrange=size(A))
-synchronize(backend)
+vecadd(dev, 64)(A, B, ndrange=length(A))
+synchronize(dev)
 ```
 """
 macro kernel(expr)
@@ -64,7 +80,7 @@ This allows for two different configurations:
 
 - [`@context`](@ref)
 
-!!! warn
+!!! warning
     This is an experimental feature.
 
 !!! note
@@ -195,7 +211,22 @@ function unsafe_free! end
 unsafe_free!(::AbstractArray) = return
 
 """
-Abstract type for all KernelAbstractions backends.
+    Backend
+
+Abstract supertype for all KernelAbstractions backends.
+
+Concrete backends (for example `CUDABackend` from CUDA.jl or [`CPU`](@ref) from this package)
+determine where arrays are allocated and where kernels execute. Use [`get_backend`](@ref) to
+obtain the backend for an array and [`allocate`](@ref) to create storage on a backend.
+
+# Example
+
+```julia
+backend = get_backend(A)
+kernel = my_kernel(backend, 256)
+kernel(A, ndrange=length(A))
+synchronize(backend)
+```
 """
 abstract type Backend end
 
@@ -214,7 +245,18 @@ export KernelIntrinsics
 # - @ndrange
 ###
 
+"""
+    groupsize(ctx)
+
+Return the workgroup size as a tuple.
+"""
 function groupsize end
+
+"""
+    ndrange(ctx)
+
+Return the launch `ndrange` as a tuple.
+"""
 function ndrange end
 
 """
@@ -333,7 +375,7 @@ workgroup. `cond` is not allowed to have any visible sideffects.
   - `GPU`: This synchronization will only occur if the `cond` evaluates.
   - `CPU`: This synchronization will always occur.
 
-!!! warn
+!!! warning
     This variant of the `@synchronize` macro violates the requirement that `@synchronize` must be encountered
     by all workitems of a work-group executing the kernel or by none at all.
     Since v`0.9.34` this version of the macro is deprecated and lowers to `@synchronize()`
@@ -349,7 +391,7 @@ end
 
 Access the hidden context object used by KernelAbstractions.
 
-!!! warn
+!!! warning
     Only valid to be used from a kernel with `cpu=false`.
 
 !!! note
@@ -663,25 +705,46 @@ function priority!(::Backend, prio::Symbol)
 end
 
 """
-    device(::Backend)::Int
+    device(backend::Backend)::Int
+
+Return the 1-based index of the currently active device for `backend`.
 
-Returns the ordinal number of the currently active device starting at one.
+!!! note
+    Backend implementations **may** implement `device(backend::Backend)::Int` if they support multiple devices.
+    They **must** implement [`ndevices`](@ref KernelAbstractions.ndevices) and [`device!`](@ref KernelAbstractions.device!).
 """
 function device(::Backend)
     return 1
 end
 
 """
-    ndevices(::Backend)::Int
+    ndevices(backend::Backend)::Int
+
+Return the number of devices available to `backend`.
 
-Returns the number of devices the backend supports.
+!!! note
+    Backend implementations **must** implement `ndevices(backend::Backend)::Int` and [`device!`](@ref KernelAbstractions.device!).
+    They **may** also implement [`device`](@ref KernelAbstractions.device) if they support multiple devices.
 """
 function ndevices(::Backend)
     return 1
 end
 
 """
-    device!(::Backend, id::Int)
+    device!(backend::Backend, id::Int)
+
+Select the active device for `backend`. `id` is a 1-based device index and must satisfy
+`1 <= id <= ndevices(backend)`.
+
+# Example
+
+```julia
+device!(CUDABackend(), 2)  # use the second CUDA device
+```
+
+!!! note
+    Backend implementations **must** implement `device!(backend::Backend, id::Int)` and [`ndevices`](@ref KernelAbstractions.ndevices).
+    They **may** also implement [`device`](@ref KernelAbstractions.device) if they support multiple devices.
 """
 function device!(backend::Backend, id::Int)
     if !(0 < id <= ndevices(backend))
@@ -721,9 +784,19 @@ import .NDIteration: get
 """
     Kernel{Backend, WorkgroupSize, NDRange, Func}
 
-Kernel closure struct that is used to represent the backend
-kernel on the host. `WorkgroupSize` is the number of workitems
-in a workgroup.
+Host-side handle for a kernel specialized on a backend, workgroup size, and `ndrange`.
+
+Kernels are created by calling a [`@kernel`](@ref) function on a backend, for example
+`my_kernel(CUDABackend(), 256)`. The returned object is callable:
+
+```julia
+kernel = my_kernel(backend, 64)
+kernel(A, B, ndrange=length(A))   # launch asynchronously
+synchronize(backend)
+```
+
+Use [`workgroupsize`](@ref KernelAbstractions.workgroupsize), [`ndrange`](@ref KernelAbstractions.ndrange),
+and [`backend`](@ref KernelAbstractions.backend) to inspect a kernel's static configuration.
 
 !!! note
     Backend implementations **must** implement:
@@ -741,12 +814,40 @@ function Base.similar(kernel::Kernel{D, WS, ND}, f::F) where {D, WS, ND, F}
     return Kernel{D, WS, ND, F}(kernel.backend, f)
 end
 
-workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
-ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize, NDRange} = NDRange
-backend(kernel::Kernel) = kernel.backend
+"""
+    workgroupsize(kernel::Kernel)
+
+Return the static workgroup size type parameter of `kernel` (`StaticSize` or `DynamicSize`).
+"""
+function workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize}
+    return WorkgroupSize
+end
+
+"""
+    ndrange(kernel::Kernel)
 
+Return the static `ndrange` type parameter of `kernel` (`StaticSize` or `DynamicSize`).
 """
-Partition a kernel for the given ndrange and workgroupsize.
+function ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize, NDRange}
+    return NDRange
+end
+
+"""
+    backend(kernel::Kernel)
+
+Return the [`Backend`](@ref) that `kernel` was constructed for.
+"""
+function backend(kernel::Kernel)
+    return kernel.backend
+end
+
+"""
+    partition(kernel, ndrange, workgroupsize)
+
+Partition the iteration space of `kernel` into workgroups.
+
+Returns the blocked iteration space and whether dynamic bounds-checking is required for the
+last (possibly partial) workgroup. Primarily used by backend implementations and tests.
 """
 @inline function partition(kernel, ndrange, workgroupsize)
     static_ndrange = KernelAbstractions.ndrange(kernel)
@@ -844,9 +945,12 @@ __size(args::Tuple) = Tuple{args...}
 __size(i::Int) = Tuple{i}
 
 """
-    argconvert(::Kernel, arg)
+    argconvert(kernel::Kernel, arg)
+
+Convert `arg` to the device-side representation expected by `kernel`'s backend.
 
-Convert arguments to the device side representation.
+Backend implementations define methods for their array and scalar types. This is called
+automatically when a kernel is launched.
 """
 argconvert(k::Kernel{T}, arg) where {T} =
     error("Don't know how to convert arguments for Kernel{$T}")
@@ -881,6 +985,31 @@ include("pocl/pocl.jl")
 using .POCL
 export POCLBackend
 
+"""
+    POCLBackend()
+
+CPU backend that compiles kernels to OpenCL via [POCL](https://portablecl.org/) and executes
+them on the host. This is the concrete type behind the [`CPU`](@ref) alias.
+"""
+POCLBackend
+
+"""
+    CPU
+
+Type alias for [`POCLBackend`](@ref), the CPU execution backend.
+
+Construct with `CPU()` (equivalent to `POCLBackend()`). Kernels run on the host via POCL/OpenCL
+using the same programming model as GPU backends, which is useful for debugging and for running
+kernel code without a GPU.
+
+# Example
+
+```julia
+A = ones(Float32, 1024)
+mul2_kernel(CPU(), 64)(A, ndrange=length(A))
+synchronize(CPU())
+```
+"""
 const CPU = POCLBackend
 
 # precompile
diff --git a/src/nditeration.jl b/src/nditeration.jl
index 3b2be2353..aacaa8bff 100644
--- a/src/nditeration.jl
+++ b/src/nditeration.jl
@@ -10,7 +10,19 @@ struct DynamicCheck end
 struct NoDynamicCheck end
 
 abstract type _Size end
+
+"""
+    DynamicSize
+
+Marker type indicating that a kernel's workgroup size or `ndrange` is chosen at launch time.
+"""
 struct DynamicSize <: _Size end
+
+"""
+    StaticSize{S}
+
+Marker type encoding a compile-time workgroup size or `ndrange` as a tuple `S`.
+"""
 struct StaticSize{S} <: _Size
     function StaticSize{S}() where {S}
         return new{S::Tuple{Vararg{Int}}}()
diff --git a/src/reflection.jl b/src/reflection.jl
index 53142cc1a..b853e4bd1 100644
--- a/src/reflection.jl
+++ b/src/reflection.jl
@@ -96,23 +96,22 @@ end
 
 
 """
-Get the typed IR for a kernel
+    @ka_code_typed [kwargs...] kernel(args...; ndrange=..., workgroupsize=...)
+
+Return the typed IR for a kernel's device function, similar to `InteractiveUtils.code_typed`.
+
+Pass `interactive=true` to descend into the IR with [Cthulhu](https://github.com/JuliaDebug/Cthulhu.jl)
+(must be loaded in the session). If `ndrange` is fixed at kernel construction time, it can be
+omitted at the call site.
 
 # Examples
+
+```julia
+@ka_code_typed my_kernel(backend)(A, ndrange=length(A))
+@ka_code_typed my_kernel(backend, 64)(A, ndrange=length(A))
+@ka_code_typed optimize=false my_kernel(backend)(A, ndrange=length(A))
+@ka_code_typed interactive=true my_kernel(CPU())(A, ndrange=length(A))
 ```
-@ka_code_typed kernel(args. ndrange=...)
-@ka_code_typed kernel(args. ndrange=... workgroupsize=...)
-@ka_code_typed optimize=false kernel(args. ndrange=...)
-```
-To use interactive mode (with Cthulhu), call
-```
-@ka_code_typed interactive=true kernel(args. ndrange=...)
-```
-If ndrange is statically defined, then you could call
-```
-@ka_code_typed kernel(args.)
-```
-Works for CPU or CUDA kernels, with static or dynamic declarations
 """
 macro ka_code_typed(ex0...)
     ex, args, old_args, kern = format_ex(ex0)
@@ -133,19 +132,18 @@ end
 
 
 """
-Get the llvm code for a kernel
+    @ka_code_llvm [kwargs...] kernel(args...; ndrange=..., workgroupsize=...)
+
+Return the LLVM IR for a kernel's device function, similar to `InteractiveUtils.code_llvm`.
+
+Only supported on the CPU backend. GPU kernels will throw an error.
 
 # Examples
+
+```julia
+@ka_code_llvm my_kernel(CPU())(A, ndrange=length(A))
+@ka_code_llvm my_kernel(CPU(), 64)(A, ndrange=length(A))
 ```
-@ka_code_llvm kernel(args. ndrange=...)
-@ka_code_llvm kernel(args. ndrange=... workgroupsize=...)
-@ka_code_llvm optimize=false kernel(args. ndrange=...)
-```
-If ndrange is statically defined, then you could call
-```
-@ka_code_llvm kernel(args.)
-```
-Works for CPU kernels ONLY, with static or dynamic declarations
 """
 macro ka_code_llvm(ex0...)
     ex, args, old_args, kern = format_ex(ex0)